In [1]:
import torch
from torch import nn
import random
import numpy as np
import os

In [2]:
# For reproducible results
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True)
    #torch.set_deterministic_debug_mode(True)
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
    
seed_everything(0)

device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
%%timeit
for _ in range(100):
    # Creating on the CPU, then transfering to the GPU
    cpu_tensor = torch.ones(1000, 64, 64)
    gpu_tensor = cpu_tensor.to("cuda")

632 ms ± 4.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
%%timeit
for _ in range(100):
    # Creating on the GPU
    gpu_tensor = torch.ones((1000, 64, 64), device="cuda")

28.1 ms ± 899 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [5]:
%%timeit
for _ in range(100):
    # Creating on the CPU, then converting to half precision
    f32_tensor = torch.ones(1000, 64, 64)
    f16_tensor = f32_tensor.half()

616 ms ± 5.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
%%timeit
for _ in range(100):
    # Creating on the CPU in half precision
    f16_tensor = torch.ones((1000, 64, 64), dtype=torch.float16)

179 ms ± 1.45 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
%%timeit
for _ in range(100):
    # Creating on the GPU, then converting to half precision
    f32_tensor = torch.ones((1000, 64, 64), device="cuda")
    f16_tensor = f32_tensor.half()

57.5 ms ± 6.13 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
%%timeit
for _ in range(100):
    # Creating on the GPU in half precision
    f16_tensor = torch.ones((1000, 64, 64), dtype=torch.float16, device="cuda")

14.1 ms ± 911 ns per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
# Use Sequential layers when possible but debugging is harder

In [None]:
self.mid_layers = []
for _ in range(5):
    self.mid_layers.append(nn.Linear(64, 64))
    self.mid_layers.append(nn.ReLU())
self.mid_layers = nn.Sequential(*self.mid_layers)

In [None]:
losses = []

for batch in data_batches:
    output = example_model(batch)
    target = torch.rand((10, 3))
    loss = criterion(output, target)
    losses.append(loss.detach())  # Or loss.item() if you don't need the gradient

print(losses)

In [None]:
# Trick to delete a model from GPU
example_model = ExampleModel().cuda()

del example_model

import gc
gc.collect()
with torch.no_grad():
    torch.cuda.empty_cache()
# Added from fastai notebook

In [None]:
# Mixed precision training uses both 16-bit and 32-bit floating-point types in a model during training to make it run faster
# and use less memory. By keeping certain parts of the model in the 32-bit types for numerical stability, the model will have a 
# lower step time and train equally as well, with minor changes to the code.

from torch.cuda.amp import autocast, GradScaler

model = YourModel().cuda()
optimizer = torch.optim.Adam(model.parameters())
scaler = GradScaler()

for inputs, targets in dataloader:
    optimizer.zero_grad()
    with autocast():
        loss = criterion(model(inputs), targets)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

In [None]:
# Lazy Loading
# Lazy loading is a design approach frequently used in machine learning and data-heavy applications, particularly when the full 
# dataset is too large to fit into memory. 
# This technique involves loading data into memory only as it's required during the execution of the program, rather than importing the 
# entire dataset all at once.
# Use cases for lazy loading:
#    Large Datasets: When the dataset is too large to fit into memory.
#    Dynamic Data: When the dataset is continuously updated or changes over time.
#    Random Access: When random (or pseudo-random) access to data points is acceptable, which is often the case in stochastic gradient descent algorithms.
#    Streamed Data: When data can be streamed from distributed file systems, databases, or even online sources.
#    Memory Efficiency: When you want to optimize the program to use as little memory as possible.
# In PyTorch, lazy loading can be achieved with the use of the DataLoader class

from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

transform = transforms.Compose([
    # add whatever transformations you need
    transforms.ToTensor(),
])

dataset = ImageFolder(root='path/to/your/images', transform=transform)

data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
# Gradient Clipping
# Gradient clipping is a technique to prevent the gradients from becoming too large, which can lead to exploding gradient problems especially 
# in recurrent neural networks (RNNs). This can be particularly useful when you are observing NaNs during training or when the losses go to infinity.

# PyTorch provides a simple utility called torch.nn.utils.clip_grad_norm_ which can be used to clip the gradients of model parameters.
# Here's how to use it:

from torch.nn.utils import clip_grad_norm_

# define your model and optimizer
model = YourModel()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for inputs, targets in dataloader:
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    
    # Clip gradients after computing the backward pass and before the optimization step
    clip_grad_norm_(model.parameters(), max_norm=1.0)
    
    optimizer.step()

In [None]:
# Quantization
# Quantization is a process used to reduce the computational and storage burdens of machine learning models by converting weights from floating 
# point representations to lower precision, such as int8 or int16. This has the benefit of reducing memory requirements and speeding up model 
# inference time, with a potential trade-off in model performance due to the reduced precision.

# PyTorch's quantization API provides tools for post-training quantization (quantization after the model has been trained), quantization-aware
# training (where quantization is considered during the training process itself), and dynamic quantization (only quantizes certain parts of the model, 
# typically the weights).
# Here's a very simplified example of how to use PyTorch's dynamic quantization on a BERT model:

from transformers import BertModel
import torch

# Load pre-trained model
model = BertModel.from_pretrained('bert-base-uncased')

# Convert to torchscript and quantize model
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

# Save quantized model
torch.jit.save(torch.jit.script(quantized_model), "quantized_bert.pt")

In [None]:
# Method named_parameters()

# Instantiate a pre-trained model
transfer_model = resnet50(pretrained=True)

# Freeze the weights of the model
for name, param in transfer_model.named_parameters():
	param.requires_grad = False

In [None]:
# Auto-Scale Batch Size
# Auto-scaling the batch size is a technique to automatically find the largest batch size that fits into memory for a given model and data.
# This is beneficial because larger batch sizes can result in faster training times due to more efficient use of hardware resources (GPU).
# However, the downside is that they can also result in memory overflow errors if the batch size is too large for the GPU to handle.
# Within PyTorch framework, TOMA is an approach that can be used for auto-scaling batch sizes. Specifically, it retries code that fails due to OOM 
# (out-of-memory) conditions and lowers batch sizes automatically (e.g. from 512, to 256, then 128, etc.). To avoid failing over repeatedly, a 
# simple cache is implemented that memorizes that last successful batchsize given the call and available free memory.
# Link: https://github.com/BlackHC/toma 

!pip install toma

from toma import toma

@toma.batch(initial_batchsize=512)
def run_inference(batchsize, model, dataset):
	# your inference code

run_inference(batchsize, model, dataset)

In [None]:
# Need explanation

optimizer = ...
NUM_ACCUMULATION_STEPS = ...
for epoch in range(...):
    for idx, sample in enumerate(dataloader):
        inputs, labels = sample

        # Forward Pass
        outputs = model(inputs)
        # Compute Loss and Perform Back-propagation
        loss = loss_fn(outputs, labels)

        # Normalize the Gradients
        loss = loss / NUM_ACCUMULATION_STEPS
        loss.backward()
        if (
            ((idx + 1) % NUM_ACCUMULATION_STEPS == 0) 
            or (idx + 1 == len(dataloader))
        ):
            # Update Optimizer
            optimizer.step()
            optimizer.zero_grad()

In [None]:
# Using Hooks
# Hooks in PyTorch allow you to modify or monitor the forward and backward passes in a neural network.
# They are essentially callback functions that can be registered on nn.Module instances, including both layers and entire models.
# These callbacks get executed when the forward or backward pass runs through the module. 
# Use cases:
# Debugging: Inspect values within the network.
# Gradient Clipping or Modification: Modify gradients during backpropagation, e.g., to prevent gradient explosion.
# Feature Extraction: Extract the output of intermediate layers for analysis or other tasks like transfer learning.
# Resource Optimization: Monitor resource usage dynamically during training.

# Forward Hook
def forward_hook(module, input, output):
    print('Inside forward hook.')
layer = nn.Linear(2, 2)
hook1 = layer.register_forward_hook(forward_hook)

# Backward Hook
def backward_hook(module, grad_input, grad_output):
    print('Inside backward hook.')
hook2 = layer.register_backward_hook(backward_hook)

# Removing Hooks
hook1.remove()
hook2.remove()

In [2]:
x = torch.rand(5)
x

tensor([0.3019, 0.6283, 0.5609, 0.7612, 0.6735])

In [3]:
y = torch.rand(4, 8, 5)
y.shape

torch.Size([4, 8, 5])

In [5]:
z = x.expand_as(y)
print(z.shape, z, sep='\n')

torch.Size([4, 8, 5])
tensor([[[0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
         [0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
         [0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
         [0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
         [0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
         [0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
         [0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
         [0.3019, 0.6283, 0.5609, 0.7612, 0.6735]],

        [[0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
         [0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
         [0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
         [0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
         [0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
         [0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
         [0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
         [0.3019, 0.6283, 0.5609, 0.7612, 0.6735]],

        [[0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
         [0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
         [0.3019, 0.6283, 0.5609, 0.7612, 0.6735],
     

In [None]:
# PyTorch v0.2.0 relase

#General Semantics

#Two tensors are “broadcastable” if the following rules hold:

#   Each tensor has at least one dimension.
#   When iterating over the dimension sizes, starting at the trailing dimension, the dimension sizes must either be equal, 
#   one of them is 1, or one of them does not exist.

#For Example:

x=torch.FloatTensor(5,7,3)
y=torch.FloatTensor(5,7,3)
# same shapes are always broadcastable (i.e. the above rules always hold)

# can line up trailing dimensions
x=torch.FloatTensor(5,3,4,1)
y=torch.FloatTensor(  3,1,1)

# x and y are broadcastable.
# 1st trailing dimension: both have size 1
# 2nd trailing dimension: y has size 1
# 3rd trailing dimension: x size == y size
# 4th trailing dimension: y dimension doesn't exist

# but:
x=torch.FloatTensor(5,2,4,1)
y=torch.FloatTensor(  3,1,1)
# x and y are not broadcastable, because in the 3rd trailing dimension 2 != 3

torch.add(torch.ones(4,1), torch.randn(4))

# would previously produce a Tensor with size: torch.Size([4,1]),
# but now produces a Tensor with size: torch.Size([4,4]).

In [6]:
# If two tensors x, y are "broadcastable", the resulting tensor size is calculated as follows:

#   If the number of dimensions of x and y are not equal, prepend 1 to the dimensions of the tensor with fewer dimensions to make them equal length.
#   Then, for each dimension size, the resulting dimension size is the max of the sizes of x and y along that dimension.

# For Example:

# can line up trailing dimensions to make reading easier
x=torch.FloatTensor(5,1,4,1)
y=torch.FloatTensor(  3,1,1)
(x+y).size()

torch.Size([5, 3, 4, 1])

In [7]:
# error case
x=torch.FloatTensor(5,2,4,1)
y=torch.FloatTensor(  3,1,1)
(x+y).size()

RuntimeError: The size of tensor a (2) must match the size of tensor b (3) at non-singleton dimension 1

In [None]:
x = torch.Tensor(5, 5, 5)

# Pure Integer Array Indexing - specify arbitrary indices at each dimension

x[[1, 2], [3, 2], [1, 0]]
#--> yields a 2-element Tensor (x[1][3][1], x[2][2][0])

# also supports broadcasting, duplicates

x[[2, 3, 2], [0], [1]]
#--> yields a 3-element Tensor (x[2][0][1], x[3][0][1], x[2][0][1])

# arbitrary indexer shapes allowed

x[[[1, 0], [0, 1]], [0], [1]].shape
#--> yields a 2x2 Tensor [[x[1][0][1], x[0][0][1]],
#                         [x[0][0][1], x[1][0][1]]]

# can use colon, ellipse

x[[0, 3], :, :]
x[[0, 3], ...]
#--> both yield a 2x5x5 Tensor [x[0], x[3]]

# also use Tensors to index!

y = torch.LongTensor([0, 2, 4])
x[y, :, :]
#--> yields a 3x5x5 Tensor [x[0], x[2], x[4]]

# selection with less than ndim, note the use of comma

x[[1, 3], ]
#--> yields a 2x5x5 Tensor [x[1], x[3]]

In [None]:
# Distributed PyTorch

# Wrap model in DistributedDataParallel (CUDA only for the moment)
model = torch.nn.parallel.DistributedDataParallel(model.cuda())

# Use a DistributedSampler to restrict each process to a distinct subset
# of the dataset.
train_dataset = ...
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=args.batch_size, num_workers=args.workers,
    pin_memory=True, sampler=train_sampler)

for epoch in range(args.num_epochs):
    # Use .set_epoch() method to reshuffle the dataset partition at every iteration
    train_sampler.set_epoch(epoch)
    # training loop
    ...

In [1]:
# Numerically stable Binary Cross-Entropy loss via bce_with_logits.

In [None]:
# PyTorc 0.3.0
"""
Unreduced losses

Now, Some loss functions can compute per-sample losses in a mini-batch

    By default PyTorch sums losses over the mini-batch and returns a single scalar loss. This was limiting to users.
    Now, a subset of loss functions allow specifying reduce=False to return individual losses for each sample in the mini-batch
    Example: loss = nn.CrossEntropyLoss(..., reduce=False)
    Currently supported losses: MSELoss, NLLLoss, NLLLoss2d, KLDivLoss, CrossEntropyLoss, SmoothL1Loss, L1Loss
"""

In [7]:
x = torch.randn((1, 1), requires_grad=True)

with torch.autograd.profiler.profile() as prof:
    y = x ** 2
    y.backward()

print(prof.key_averages().table(sort_by="cpu_time_total"))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
      autograd::engine::evaluate_function: PowBackward0         3.97%      15.000us        49.74%     188.000us     188.000us             1  
                                           PowBackward0         6.88%      26.000us        45.77%     173.000us     173.000us             1  
                                              aten::pow        37.04%     140.000us        43.92%     166.000us      83.000us             2  
                                              aten::mul        13.23%      50.000us        20.63%      78.000us      39.000us             2  
      

STAGE:2023-10-26 16:49:51 33993:33993 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2023-10-26 16:49:51 33993:33993 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2023-10-26 16:49:51 33993:33993 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [9]:
from torchvision import models

model = models.resnet18()
x = torch.randn(1, 3, 224, 224)

In [14]:
!nvprof --profile-from-start off -o trace_name.prof -- python model_test.py

==36048== NVPROF is profiling process 36048, command: python model_test.py
==36048== Generated result file: /home/cosmos/Documents/Practices/ML-Practices/PyTorch/PyTorch Tricks/trace_name.prof


In [15]:
# The profiler works for both CPU and CUDA models.
# For CUDA models, you have to run your python program with a special nvprof prefix. For example:

# nvprof --profile-from-start off -o trace_name.prof -- python <your arguments>
"""
# in python
with torch.cuda.profiler.profile():
    model(x) # Warmup CUDA memory allocator and profiler
    with torch.autograd.profiler.emit_nvtx():
        model(x)
"""
# Then, you can load trace_name.prof in PyTorch and print a summary profile report.

prof = torch.autograd.profiler.load_nvprof('trace_name.prof')
print(prof)

AssertionError: Expected time_us == 0 but got 129954

In [16]:
import torch
import torchvision.models as models
import torch.autograd.profiler as profiler

model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)
with profiler.profile(profile_memory=True, record_shapes=True) as prof:
    model(inputs)

# NOTE: some columns were removed for brevity
print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))
# ---------------------------  ---------------  ---------------  ---------------
# Name                         CPU Mem          Self CPU Mem     Number of Calls
# ---------------------------  ---------------  ---------------  ---------------
# empty                        94.79 Mb         94.79 Mb         123
# resize_                      11.48 Mb         11.48 Mb         2
# addmm                        19.53 Kb         19.53 Kb         1
# empty_strided                4 b              4 b              1
# conv2d                       47.37 Mb         0 b              20
# ---------------------------  ---------------  ---------------  ---------------

STAGE:2023-10-27 18:12:48 26895:26895 ActivityProfilerController.cpp:312] Completed Stage: Warm Up


---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                      aten::empty         0.46%       1.432ms         4.10%      12.882ms      64.410us      94.86 Mb      94.85 Mb           200  
    aten::max_pool2d_with_indices         4.91%      15.424ms         4.91%      15.424ms      15.424ms      11.48 Mb      11.48 Mb             1  
                      aten::addmm         2.91%       9.130ms         2.92%       9.172ms       9.172ms      19.53 Kb      19.53 Kb             1  
                       aten::mean         0.05%     172.000us         0.11%     340.000us     340.000us      10.

STAGE:2023-10-27 18:12:49 26895:26895 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2023-10-27 18:12:49 26895:26895 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [None]:
"""
Optimizers

    optim.SparseAdam: Implements a lazy version of Adam algorithm suitable for sparse tensors.
        In this variant, only moments that show up in the gradient get updated, and only those portions of the gradient get applied to the parameters.
    Optimizers now have an add_param_group function that lets you add new parameter groups to an already constructed optimizer.
"""

In [17]:
print(torch.cuda.get_device_name(0), torch.cuda.get_device_capability(0), torch.cuda.get_device_properties(0), sep='\n')

NVIDIA GeForce GTX 1650
(7, 5)
_CudaDeviceProperties(name='NVIDIA GeForce GTX 1650', major=7, minor=5, total_memory=3903MB, multi_processor_count=16)


In [None]:
"""
pad_packed_sequence now allows a padding_value argument that can be used instead of zero-padding

Dataset now has a + operator (which uses ConcatDataset). You can do something like MNIST(...) + FashionMNIST(...) 
for example, and you will get a concatenated dataset containing samples from both.

If you want to load a model's state_dict into another model (for example to fine-tune a pre-trained network), 
load_state_dict was strict on matching the key names of the parameters. Now we provide a strict=False option to 
load_state_dict where it only loads in parameters where the keys match, and ignores the other parameter keys.
"""

In [26]:
torch.version.cuda

'12.1'

In [None]:
# PyTorch 0.3.1 release

# Allow map_location in torch.load to be a string, such as map_location='cpu' or map_location='cuda:2' #4203
model = torch.load('model.pt', map_location='cpu')

In [33]:
x = torch.DoubleTensor([1, 1, 1])
print(type(x))
print(x.type())
print(isinstance(x, torch.DoubleTensor))

<class 'torch.Tensor'>
torch.DoubleTensor
True


In [34]:
x.requires_grad

False

In [35]:
x.requires_grad_()

tensor([1., 1., 1.], dtype=torch.float64, requires_grad=True)

In [None]:
"""
What about .data?

.data was the primary way to get the underlying Tensor from a Variable. After this merge, calling y = x.data still has similar semantics.
So y will be a Tensor that shares the same data with x, is unrelated with the computation history of x, and has requires_grad=False.

However, .data can be unsafe in some cases. Any changes on x.data wouldn't be tracked by autograd, and the computed gradients would be 
incorrect if x is needed in a backward pass. A safer alternative is to use x.detach(), which also returns a Tensor that shares data with 
requires_grad=False, but will have its in-place changes reported by autograd if x is needed in backward.
"""

In [36]:
torch.tensor(3.1416)         # create a scalar directly

tensor(3.1416)

In [37]:
torch.tensor(3.1416).size()  # scalar is 0-dimensional

torch.Size([])

In [38]:
torch.tensor([3]).size()     # compare to a vector of size 1

torch.Size([1])

In [39]:
vector = torch.arange(2, 6)  # this is a vector
vector

tensor([2, 3, 4, 5])

In [40]:
vector.size()

torch.Size([4])

In [41]:
vector[3]                    # indexing into a vector gives a scalar

tensor(5)

In [42]:
vector[3].item()             # .item() gives the value as a Python number

5

In [43]:
sum = torch.tensor([2, 3]).sum()
sum

tensor(5)

In [44]:
sum.size()

torch.Size([])

In [49]:
x = torch.zeros(1, requires_grad=True)
y = x * 2
y.requires_grad

True

In [50]:
x = torch.zeros(1, requires_grad=True)
with torch.no_grad():
    y = x * 2
y.requires_grad

False

In [51]:
x = torch.zeros(1, requires_grad=True)
with torch.inference_mode():
    y = x * 2
y.requires_grad

False

In [52]:
x = torch.zeros(1, requires_grad=True)
is_train = False
with torch.set_grad_enabled(is_train):
    y = x * 2
y.requires_grad

False

In [71]:
torch.get_autocast_gpu_dtype(), torch.get_autocast_cpu_dtype(), torch.get_default_dtype(), torch.get_num_threads(), torch.get_rng_state(), torch.get_num_interop_threads()

(torch.float16,
 torch.bfloat16,
 torch.float32,
 4,
 tensor([0, 0, 0,  ..., 0, 0, 0], dtype=torch.uint8),
 4)

In [None]:
"""
Below is a complete list of available torch.dtypes (data types) and their corresponding tensor types.
Data type 	                torch.dtype 	                Tensor types
32-bit floating point 	    torch.float32 or torch.float 	torch.*.FloatTensor
64-bit floating point 	    torch.float64 or torch.double 	torch.*.DoubleTensor
16-bit floating point 	    torch.float16 or torch.half 	torch.*.HalfTensor
8-bit integer (unsigned) 	torch.uint8 	                torch.*.ByteTensor
8-bit integer (signed) 	    torch.int8 	                    torch.*.CharTensor
16-bit integer (signed) 	torch.int16 or torch.short 	    torch.*.ShortTensor
32-bit integer (signed) 	torch.int32 or torch.int 	    torch.*.IntTensor
64-bit integer (signed) 	torch.int64 or torch.long 	    torch.*.LongTensor
"""

In [72]:
default_id = torch.cuda.current_device()

torch.device('{device_type}:{device_ordinal}')

0

In [73]:
x = torch.randn(3, dtype=torch.float64)
x.new_ones(2)

tensor([1., 1.], dtype=torch.float64)

In [74]:
x.new_ones(4, dtype=torch.int)

tensor([1, 1, 1, 1], dtype=torch.int32)

In [76]:
torch.full((2, 3), 3.1416)

tensor([[3.1416, 3.1416, 3.1416],
        [3.1416, 3.1416, 3.1416]])

In [79]:
torch.randperm(4)

tensor([2, 3, 0, 1])

In [None]:
"""
torch.empty 	    unintialized memory
torch.zeros 	    all zeros
torch.ones 	        all ones
torch.full 	        filled with a given value
torch.rand 	        i.i.d. continuous Uniform[0, 1)
torch.randn 	    i.i.d. Normal(0, 1)
torch.randint 	    i.i.d. discrete Uniform in given range
torch.randperm 	    random permutation of {0, 1, ..., n - 1}
torch.tensor 	    copied from existing data (list, NumPy ndarray, etc.)
torch.from_numpy* 	from NumPy ndarray (sharing storage without copying)
torch.arange,
torch.range         uniformly spaced values in a given range
torch.linspace
torch.logspace 	    logarithmically spaced values in a given range
torch.eye 	        identity matrix
"""

In [82]:
a = torch.rand(10, 10, 10, 10)

# the indexing elements can have other shapes than 1
b = a[[[3, 2]], :, [[1, 3]]]
b.shape

torch.Size([1, 2, 10, 10])

In [83]:
# broadcasting also supported in the indices, as well as lists,
# negative indices, slices, elipses, numbers
c = a[[1, -2], 2:4, :, [1]]
c.shape

torch.Size([2, 2, 10])

In [84]:
# can also support tensors as indices
index = torch.tensor([2, 4])
d = a[index]
d.shape

torch.Size([2, 10, 10, 10])

In [87]:
# and the indices can be on the GPU or CPU
e = a.cuda()[index.cuda()]
f = a.cuda()[index.cpu()]
e.shape, f.shape

(torch.Size([2, 10, 10, 10]), torch.Size([2, 10, 10, 10]))

In [88]:
g = a[index.cuda()]

RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)

In [89]:
mask = torch.rand(10) > 0.5
# we can now index with a mask that has fewer
# dimensions than the indexing tensor
c = a[mask, :5]
c.shape

torch.Size([7, 5, 10, 10])

In [None]:
# Add torch.reshape, which is similar to numpy.reshape. It is roughly equivalent to tensor.contiguous().view(), 
# but avoids copying in certain cases #5575

In [90]:
a = torch.arange(0, 9).reshape(3, 3)
# the following transposes a
b = torch.einsum('ij->ji', (a,))
print(a, b, sep='\n')

tensor([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]])
tensor([[0, 3, 6],
        [1, 4, 7],
        [2, 5, 8]])


In [91]:
# Add torch.expm1, a numerically stable exp(x)-1 for small x. #4350

In [92]:
# Add torch.where(condition, tensor1, tensor2) that returns a tensors of elements selected from tensor1 or tensor2 based on condition. #4259, #4259

In [None]:
# A new autograd container that lets you trade compute for memory

# The new checkpoint container allows you to only store a subset of the outputs necessary for backpropagation.
# If an output is missing (to save memory), the checkpoint container will recompute the intermediate outputs from 
# the closest checkpoint, so that memory usage can be reduced (with an increase in computation time).
# Here is an example:

# create the input tensors and set the requires_grad=True
# NOTE: the requires_grad=True for the input is a current
# limitation of checkpointing. At least one of the 
# model inputs should have requires_grad=True. 
# If you don't do it, you might have empty gradients.
input = torch.rand(1, 10, requires_grad=True)
layers = [nn.Linear(10, 10) for _ in range(1000)]

# define function that will define where
# we will checkpoint and store
# intermediate gradients. In this case,
# we will only store one intermediate
# gradient, in the middle of the
# model

def run_first_half(*args):
    x = args[0]
    for layer in layers[:500]:
        x = layer(x)
    return x

def run_second_half(*args):
    x = args[0]
    for layer in layers[500:-1]:
        x = layer(x)
    return x

# now uses the new checkpoint functionality
from torch.utils.checkpoint import checkpoint

x = checkpoint(run_first_half, input)
x = checkpoint(run_second_half, x)
# last output need to be run without checkpoint
x = layers[-1](x)
x.sum.backward()  # works!

# For sequential modules (which can have arbitrary blocks inside), a helper function checkpoint_sequential is provided, which takes care of the most common use-cases:

input = torch.rand(1, 10, requires_grad=True)
layers = [nn.Linear(10, 10) for _ in range(1000)]
model = nn.Sequential(*layers)

from torch.utils.checkpoint import checkpoint_sequential

# split in two blocks
num_segments = 2
x = checkpoint_sequential(model, num_segments, input)
x.sum().backward()  # works!

In [96]:
model = nn.Sequential(nn.Linear(2, 2), nn.ReLU(), nn.Linear(2, 2))
del model[1]  # deletes nn.ReLU
model

Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=2, bias=True)
)

In [3]:
input = torch.randint(0, 8, (5,), dtype=torch.int64)
weights = torch.linspace(0, 1, steps=5)
input, weights

(tensor([4, 7, 5, 0, 3]), tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000]))

In [4]:
torch.bincount(input)

tensor([1, 0, 0, 1, 1, 1, 0, 1])

In [5]:
input.bincount(weights)

tensor([0.7500, 0.0000, 0.0000, 1.0000, 0.0000, 0.5000, 0.0000, 0.2500])

In [None]:
# torch.as_tensor (similar to torch.tensor but never copies unless necessary) #7109

tensor = torch.randn(3, device='cpu', dtype=torch.float32)
torch.as_tensor(tensor)                       # doesn't copy
torch.as_tensor(tensor, dtype=torch.float64)  # copies due to incompatible dtype
torch.as_tensor(tensor, device='cuda')        # copies due to incompatible device
array = np.array([3, 4.5])
torch.as_tensor(array)                        # doesn't copy, sharing memory with the numpy array
torch.as_tensor(array, device='cuda')         # copies due to incompatible device

In [6]:
import os
os.environ['OMP_NUM_THREADS']='1'  #Use one CPU thread
import torch, torch.nn as nn, time
def test_net(net,offset):
    net.eval()
    total=0
    with torch.no_grad():
        for _ in range(100):
            x = torch.randn(100,100,100)+offset
            start_time = time.time()
            y = net(x)
            total+=time.time()-start_time
    print(net, total*10, 'ms')

for offset in [-1,0,+1]:
    test_net(nn.LeakyReLU(),offset) 
    test_net(nn.PReLU(),offset) 

LeakyReLU(negative_slope=0.01) 0.6847500801086426 ms
PReLU(num_parameters=1) 0.8758091926574707 ms
LeakyReLU(negative_slope=0.01) 0.6054043769836426 ms
PReLU(num_parameters=1) 0.8743023872375488 ms
LeakyReLU(negative_slope=0.01) 0.8593487739562988 ms
PReLU(num_parameters=1) 0.8649063110351562 ms


In [4]:
a = torch.tensor([1, 2, 3])
a.masked_select(torch.tensor([False,  True,  True]))

tensor([2, 3])

In [6]:
# PyTorch 1.3.0 release

x = torch.rand(10,1, dtype=torch.float32)
xq = torch.quantize_per_tensor(x, scale = 0.5, zero_point = 8, dtype=torch.quint8)
xq
# xq is a quantized tensor with data represented as quint8

tensor([[0.5000],
        [0.5000],
        [0.0000],
        [0.0000],
        [0.5000],
        [0.5000],
        [0.5000],
        [1.0000],
        [0.0000],
        [0.5000]], size=(10, 1), dtype=torch.quint8,
       quantization_scheme=torch.per_tensor_affine, scale=0.5, zero_point=8)

In [7]:
xdq = x.dequantize()
xdq
# convert back to floating point

tensor([[0.3489],
        [0.4017],
        [0.0223],
        [0.1689],
        [0.2939],
        [0.5185],
        [0.6977],
        [0.8000],
        [0.1610],
        [0.2823]])

In [8]:
# PyTorch 1.4.0 release

from torch.nn.utils import prune

t = torch.rand(2, 5)
t

tensor([[0.6816, 0.9152, 0.3971, 0.8742, 0.4194],
        [0.5529, 0.9527, 0.0362, 0.1852, 0.3734]])

In [9]:
p = prune.L1Unstructured(amount=0.7)
pruned_tensor = p.prune(t)
pruned_tensor

tensor([[0.0000, 0.9152, 0.0000, 0.8742, 0.0000],
        [0.0000, 0.9527, 0.0000, 0.0000, 0.0000]])

In [11]:
m = nn.Conv2d(3, 1, 2)
prune.ln_structured(module=m, name='weight', amount=0.5, n=2, dim=0)

Conv2d(3, 1, kernel_size=(2, 2), stride=(1, 1))

In [13]:
# LR Chaining

import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import ExponentialLR, StepLR

model = [torch.nn.Parameter(torch.randn(2, 2, requires_grad=True))]
optimizer = SGD(model, 0.1)

scheduler1 = ExponentialLR(optimizer, gamma=0.9)
scheduler2 = StepLR(optimizer, step_size=3, gamma=0.1)
for epoch in range(5):
    print(epoch, scheduler2.get_last_lr()[0])
    optimizer.step()
    scheduler1.step()
    scheduler2.step()

0 0.1
1 0.09000000000000001
2 0.08100000000000002
3 0.007290000000000002
4 0.006561000000000002


In [None]:
m = MyMod()
torch.save(m.state_dict(), 'mymod.pt') # Saves a zipfile to mymod.pt

# To use the old format, pass the flag _use_new_zipfile_serialization=False

m = MyMod()
torch.save(m.state_dict(), 'mymod.pt', _use_new_zipfile_serialization=False) # Saves pickle

In [None]:
# PyTorch 1.7.0 release
"""
[Beta] torch.set_deterministic

Reproducibility (bit-for-bit determinism) may help identify errors when debugging or testing a program. To facilitate reproducibility, 
PyTorch 1.7 adds the torch.set_deterministic(bool) function that can direct PyTorch operators to select deterministic algorithms when available,
and to throw a runtime error if an operation may result in nondeterministic behavior. By default, the flag this function controls is false and 
there is no change in behavior, meaning PyTorch may implement its operations nondeterministically by default.

More precisely, when this flag is true:

    Operations known to not have a deterministic implementation throw a runtime error;
    Operations with deterministic variants use those variants (usually with a performance penalty versus the non-deterministic version); and
    torch.backends.cudnn.deterministic = True is set.

Note that this is necessary, but not sufficient, for determinism within a single run of a PyTorch program. Other sources of randomness like random 
number generators, unknown operations, or asynchronous or distributed computation may still cause nondeterministic behavior.

See the documentation for torch.set_deterministic(bool) for the list of affected operations.

    RFC | Link: https://github.com/pytorch/pytorch/issues/15359
    Documentation | Link: https://pytorch.org/docs/stable/generated/torch.set_deterministic.html

"""

In [6]:
torch.utils.collect_env()

AttributeError: module 'torch.utils' has no attribute 'collect_env'

In [3]:
torch.cuda.is_bf16_supported()

False

True