In [1]:
import torch
from torch import nn

In [6]:
%%timeit
for _ in range(100):
    # Creating on the CPU, then transfering to the GPU
    cpu_tensor = torch.ones(1000, 64, 64)
    gpu_tensor = cpu_tensor.cuda()

481 ms ± 20.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
%%timeit
for _ in range(100):
    # Creating on the GPU
    gpu_tensor = torch.ones((1000, 64, 64), device='cuda')

14 ms ± 640 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
# Use Sequential layers when possible

In [None]:
# Collect five Linear(64, 64) -> ReLU pairs in a plain list, then wrap them in a single
# nn.Sequential so they are registered as one sub-module and run in order.
stacked_layers = []
for _ in range(5):
    stacked_layers.extend((nn.Linear(64, 64), nn.ReLU()))
self.mid_layers = nn.Sequential(*stacked_layers)

In [None]:
# Keep one loss value per batch for later inspection
losses = []

for batch in data_batches:
    prediction = example_model(batch)
    loss = criterion(prediction, torch.rand(10, 3))
    # Store a detached copy so the list does not keep each batch's autograd graph alive;
    # use loss.item() instead if a plain Python number is enough
    losses.append(loss.detach())

print(losses)

In [None]:
import gc

# Trick to delete a model from GPU
example_model = ExampleModel().cuda()

# Drop this reference to the model, run the garbage collector, and then ask PyTorch
# to release the GPU memory it is still caching
del example_model
gc.collect()
torch.cuda.empty_cache()
# TODO: Add from fastai notebook

In [None]:
# Mixed precision training uses both 16-bit and 32-bit floating-point types in a model during training to make it run faster
# and use less memory. Because certain parts of the model are kept in 32-bit types for numerical stability, the model gets a
# lower step time and trains equally well, with only minor changes to the code.

from torch.cuda.amp import autocast, GradScaler

model = YourModel().cuda()
optimizer = torch.optim.Adam(model.parameters())
# The scaler multiplies the loss before backward() and un-scales the gradients before the step
scaler = GradScaler()

for inputs, targets in dataloader:
    # The model lives on the GPU, so every batch has to be moved there as well
    # (this is a no-op when the dataloader already yields CUDA tensors)
    inputs, targets = inputs.cuda(), targets.cuda()

    optimizer.zero_grad()
    # Only the forward pass and the loss computation run under autocast
    with autocast():
        loss = criterion(model(inputs), targets)
    # Backward on the scaled loss, then let the scaler drive the optimizer step
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

In [None]:
# Lazy Loading
# Lazy loading is a design approach frequently used in machine learning and data-heavy applications, particularly when the full 
# dataset is too large to fit into memory. 
# This technique involves loading data into memory only as it's required during the execution of the program, rather than importing the 
# entire dataset all at once.
# Use cases for lazy loading:
#    Large Datasets: When the dataset is too large to fit into memory.
#    Dynamic Data: When the dataset is continuously updated or changes over time.
#    Random Access: When random (or pseudo-random) access to data points is acceptable, which is often the case in stochastic gradient descent algorithms.
#    Streamed Data: When data can be streamed from distributed file systems, databases, or even online sources.
#    Memory Efficiency: When you want to optimize the program to use as little memory as possible.
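# In PyTorch, lazy loading is achieved by pairing a Dataset that reads each sample only when it is accessed (such as ImageFolder below)
# with a DataLoader, which requests samples from that Dataset one batch at a time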

from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

# Transform pipeline applied to the images; add whatever transformations you need to the list
transform = transforms.Compose([transforms.ToTensor()])

# Dataset over the image folder, using the pipeline above
dataset = ImageFolder(root='path/to/your/images', transform=transform)

# Shuffled batches of 32 samples drawn from the dataset
data_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
)

In [None]:
# Gradient Clipping
# Gradient clipping is a technique to prevent the gradients from becoming too large, which can lead to exploding gradient problems especially 
# in recurrent neural networks (RNNs). This can be particularly useful when you are observing NaNs during training or when the losses go to infinity.

# PyTorch provides a simple utility called torch.nn.utils.clip_grad_norm_ which can be used to clip the gradients of model parameters.
# Here's how to use it:

from torch.nn.utils import clip_grad_norm_

# Set up the model and its optimizer
model = YourModel()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for inputs, targets in dataloader:
    optimizer.zero_grad()
    loss = criterion(model(inputs), targets)
    loss.backward()

    # Clipping has to happen once the gradients exist (after backward) and before
    # the optimizer consumes them (before step)
    clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

In [None]:
# Quantization
# Quantization is a process used to reduce the computational and storage burdens of machine learning models by converting weights from floating 
# point representations to lower precision, such as int8 or int16. This has the benefit of reducing memory requirements and speeding up model 
# inference time, with a potential trade-off in model performance due to the reduced precision.

# PyTorch's quantization API provides tools for post-training static quantization (weights and activations are quantized after the model
# has been trained, using calibration data), quantization-aware training (where quantization is simulated during the training process itself),
# and dynamic quantization (weights are quantized ahead of time, while activations are quantized on the fly during inference).
# Here's a very simplified example of how to use PyTorch's dynamic quantization on a BERT model:

from transformers import BertModel
import torch

# Load pre-trained model. torchscript=True configures the Hugging Face model for
# TorchScript export (tuple outputs instead of dict-like ones).
model = BertModel.from_pretrained('bert-base-uncased', torchscript=True)
model.eval()

# Dynamically quantize the Linear layers to int8
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

# Convert to TorchScript by tracing with dummy inputs. Tracing (rather than
# torch.jit.script) is the export route shown in the Hugging Face docs and in the
# PyTorch BERT dynamic-quantization tutorial.
dummy_input_ids = torch.randint(0, model.config.vocab_size, (1, 128))
dummy_attention_mask = torch.ones_like(dummy_input_ids)
dummy_token_type_ids = torch.zeros_like(dummy_input_ids)
traced_model = torch.jit.trace(
    quantized_model,
    (dummy_input_ids, dummy_attention_mask, dummy_token_type_ids),
)

# Save quantized model
torch.jit.save(traced_model, "quantized_bert.pt")

In [None]:
# Method named_parameters()

# resnet50 was never imported in this notebook; bring it into scope so the cell
# runs on a fresh kernel
from torchvision.models import resnet50

# Instantiate a pre-trained model
transfer_model = resnet50(pretrained=True)

# Freeze the weights of the model: named_parameters() yields (name, parameter) pairs,
# and turning off requires_grad stops gradients from being computed for that parameter
for name, param in transfer_model.named_parameters():
    param.requires_grad = False

In [None]:
# Auto-Scale Batch Size
# Auto-scaling the batch size is a technique to automatically find the largest batch size that fits into memory for a given model and data.
# This is beneficial because larger batch sizes can result in faster training times due to more efficient use of hardware resources (GPU).
# However, the downside is that they can also result in memory overflow errors if the batch size is too large for the GPU to handle.
# Within the PyTorch ecosystem, toma is a library that can be used for auto-scaling batch sizes. Specifically, it retries code that fails due to OOM
# (out-of-memory) conditions and lowers the batch size automatically (e.g. from 512 to 256, then 128, etc.). To avoid failing over repeatedly, a
# simple cache is implemented that remembers the last successful batch size given the call and available free memory.
# Link: https://github.com/BlackHC/toma 

!pip install toma

from toma import toma

@toma.batch(initial_batchsize=512)
def run_inference(batchsize, model, dataset):
    """Placeholder for the inference routine that toma retries with smaller batch sizes."""
    # your inference code
    ...  # a comment alone is not a valid function body, so keep an explicit placeholder

# NOTE(review): `batchsize` is not defined anywhere in this cell, and the decorator is
# presumably what supplies it to the function. Confirm against the toma README whether
# this call should be run_inference(model, dataset) instead.
run_inference(batchsize, model, dataset)

In [None]:
# Gradient accumulation
# Instead of updating the weights after every batch, the gradients of NUM_ACCUMULATION_STEPS
# consecutive batches are summed (backward() adds to the existing .grad values) and the
# optimizer steps once per group of batches.

optimizer = ...
NUM_ACCUMULATION_STEPS = ...
for epoch in range(...):
    for step, (inputs, labels) in enumerate(dataloader, start=1):
        # Forward pass, then the loss scaled down by the number of accumulated batches
        # so that the summed gradients stay on the scale of a single batch
        loss = loss_fn(model(inputs), labels) / NUM_ACCUMULATION_STEPS
        loss.backward()

        # Update once a full group has been accumulated, or on the final batch so that
        # a trailing partial group is not dropped; then clear the gradients for the next group
        if step % NUM_ACCUMULATION_STEPS == 0 or step == len(dataloader):
            optimizer.step()
            optimizer.zero_grad()

In [None]:
# Using Hooks
# Hooks in PyTorch allow you to modify or monitor the forward and backward passes in a neural network.
# They are essentially callback functions that can be registered on nn.Module instances, including both layers and entire models.
# These callbacks get executed when the forward or backward pass runs through the module. 
# Use cases:
# Debugging: Inspect values within the network.
# Gradient Clipping or Modification: Modify gradients during backpropagation, e.g., to prevent gradient explosion.
# Feature Extraction: Extract the output of intermediate layers for analysis or other tasks like transfer learning.
# Resource Optimization: Monitor resource usage dynamically during training.

# Forward Hook
def forward_hook(module, input, output):
    """Called after `module`'s forward pass with its inputs and output; only prints a message."""
    print('Inside forward hook.')
layer = nn.Linear(2, 2)
# register_* returns a handle that is used below to detach the hook again
hook1 = layer.register_forward_hook(forward_hook)

# Backward Hook
def backward_hook(module, grad_input, grad_output):
    """Called during the backward pass with the module's input/output gradients; only prints a message."""
    print('Inside backward hook.')
# register_backward_hook is deprecated; register_full_backward_hook is its replacement
# and takes a hook with the same (module, grad_input, grad_output) signature
hook2 = layer.register_full_backward_hook(backward_hook)

# Removing Hooks
hook1.remove()
hook2.remove()