In [1]:
import math
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim


In [5]:
# A 12-element float32 vector holding 0.0 through 11.0.
x = torch.arange(0, 12, dtype=torch.float32)
x

tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.])

In [6]:
# Total number of elements stored in x (the recorded result is 12).
x.numel()

12

In [7]:
# numel() reports the total element count, independent of how many
# dimensions the elements are arranged into.
def _describe(heading, tensor):
    """Print `heading`, then the tensor's shape next to its element count."""
    print(heading)
    print(f"Shape: {tensor.shape}, numel(): {tensor.numel()}")


# Four tensors of increasing rank (12, 12, 24 and 24 elements).
a = torch.arange(12)
b = torch.arange(12).reshape(3, 4)
c = torch.arange(24).reshape(2, 3, 4)
d = torch.arange(24).reshape(1, 2, 3, 4)

_describe("1D tensor:", a)
_describe("\n2D tensor:", b)
_describe("\n3D tensor:", c)
_describe("\n4D tensor:", d)

# The element count equals the product of the shape's dimensions.
print(f"\nFor shape (2, 3, 4): 2 × 3 × 4 = {2*3*4}")
print(f"c.numel() = {c.numel()}")

1D tensor:
Shape: torch.Size([12]), numel(): 12

2D tensor:
Shape: torch.Size([3, 4]), numel(): 12

3D tensor:
Shape: torch.Size([2, 3, 4]), numel(): 24

4D tensor:
Shape: torch.Size([1, 2, 3, 4]), numel(): 24

For shape (2, 3, 4): 2 × 3 × 4 = 24
c.numel() = 24


In [9]:
# The floats 0..119 arranged as a 2 x 3 x 4 x 5 block.
a = torch.arange(0, 120, dtype=torch.float32).reshape(2, 3, 4, 5)

# Show the whole tensor, its shape, and the very last row of five values
# (first-axis index 1, second-axis index 2, third-axis index 3).
print(a, a.shape, a[1, 2, 3], sep="\n")

tensor([[[[  0.,   1.,   2.,   3.,   4.],
          [  5.,   6.,   7.,   8.,   9.],
          [ 10.,  11.,  12.,  13.,  14.],
          [ 15.,  16.,  17.,  18.,  19.]],

         [[ 20.,  21.,  22.,  23.,  24.],
          [ 25.,  26.,  27.,  28.,  29.],
          [ 30.,  31.,  32.,  33.,  34.],
          [ 35.,  36.,  37.,  38.,  39.]],

         [[ 40.,  41.,  42.,  43.,  44.],
          [ 45.,  46.,  47.,  48.,  49.],
          [ 50.,  51.,  52.,  53.,  54.],
          [ 55.,  56.,  57.,  58.,  59.]]],


        [[[ 60.,  61.,  62.,  63.,  64.],
          [ 65.,  66.,  67.,  68.,  69.],
          [ 70.,  71.,  72.,  73.,  74.],
          [ 75.,  76.,  77.,  78.,  79.]],

         [[ 80.,  81.,  82.,  83.,  84.],
          [ 85.,  86.,  87.,  88.,  89.],
          [ 90.,  91.,  92.,  93.,  94.],
          [ 95.,  96.,  97.,  98.,  99.]],

         [[100., 101., 102., 103., 104.],
          [105., 106., 107., 108., 109.],
          [110., 111., 112., 113., 114.],
          [115., 116., 117., 118., 119.]]]])
torch.Size([2, 3, 4, 5])
tensor([115., 116., 117., 118., 119.])

In [10]:
# torch.cat(): join tensors end to end along one of their existing dimensions.
def _show(heading, tensor):
    """Print `heading`, the tensor on the following line(s), then its shape."""
    print(f"{heading}\n{tensor}")
    print(f"Shape: {tensor.shape}")


print("torch.cat() concatenates tensors along a specified dimension")

# Three 2x3 integer tensors to join together.
x = torch.tensor([[1, 2, 3], [4, 5, 6]])
y = torch.tensor([[7, 8, 9], [10, 11, 12]])
z = torch.tensor([[13, 14, 15], [16, 17, 18]])

_show("\nTensor x:", x)
_show("\nTensor y:", y)

# dim=0 stacks the rows of y underneath the rows of x.
cat_dim0 = torch.cat((x, y), dim=0)
_show("\ntorch.cat([x, y], dim=0) - concatenate along rows:", cat_dim0)

# dim=1 appends the columns of y to the right of the columns of x.
cat_dim1 = torch.cat((x, y), dim=1)
_show("\ntorch.cat([x, y], dim=1) - concatenate along columns:", cat_dim1)

# Any number of tensors can be joined in a single call.
cat_multiple = torch.cat((x, y, z), dim=0)
_show("\ntorch.cat([x, y, z], dim=0) - concatenate 3 tensors:", cat_multiple)

torch.cat() concatenates tensors along a specified dimension

Tensor x:
tensor([[1, 2, 3],
        [4, 5, 6]])
Shape: torch.Size([2, 3])

Tensor y:
tensor([[ 7,  8,  9],
        [10, 11, 12]])
Shape: torch.Size([2, 3])

torch.cat([x, y], dim=0) - concatenate along rows:
tensor([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12]])
Shape: torch.Size([4, 3])

torch.cat([x, y], dim=1) - concatenate along columns:
tensor([[ 1,  2,  3,  7,  8,  9],
        [ 4,  5,  6, 10, 11, 12]])
Shape: torch.Size([2, 6])

torch.cat([x, y, z], dim=0) - concatenate 3 tensors:
tensor([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12],
        [13, 14, 15],
        [16, 17, 18]])
Shape: torch.Size([6, 3])


In [13]:
# Matrix multiplication in PyTorch: four equivalent spellings for 2-D inputs,
# followed by a batched example.
print("Matrix Multiplication in PyTorch")

# Create example matrices with different dimensions
A = torch.tensor([[1, 2, 3],
                  [4, 5, 6]], dtype=torch.float32)  # 2x3 matrix
B = torch.tensor([[7, 8],
                  [9, 10],
                  [11, 12]], dtype=torch.float32)    # 3x2 matrix

print(f"\nMatrix A:\n{A}")
print(f"Shape: {A.shape}")

print(f"\nMatrix B:\n{B}")
print(f"Shape: {B.shape}")

print("\n" + "="*50)

# Method 1: @ operator (recommended for matrix multiplication)
result1 = A @ B
print(f"Method 1 - A @ B (@ operator):\n{result1}")

# Method 2: torch.matmul() function
result2 = torch.matmul(A, B)
print(f"\nMethod 2 - torch.matmul(A, B):\n{result2}")

# Method 3: torch.mm() function (only for 2D matrices)
result3 = torch.mm(A, B)
print(f"\nMethod 3 - torch.mm(A, B):\n{result3}")

# Method 4: A.mm(B) method
result4 = A.mm(B)
print(f"\nMethod 4 - A.mm(B):\n{result4}")

# Check the claim printed below instead of asserting it only in prose: every
# spelling must agree with the @ operator's result.
assert all(
    torch.allclose(result1, other) for other in (result2, result3, result4)
), "matrix multiplication methods disagree"

print("\n" + "="*50)
print("All methods give the same result!")

# Element-wise multiplication (different from matrix multiplication)
# Note: A is 2x3 and B is 3x2, so A * B is deliberately not executed here;
# the cell only reports why it would not work.
print("\nElement-wise multiplication A * B would fail because shapes don't match:")
print(f"A shape: {A.shape}, B shape: {B.shape}")
print("Element-wise multiplication requires same shapes or broadcasting compatibility")

# Batch matrix multiplication example
print("\n" + "="*50)
print("Batch matrix multiplication:")
# The random values are unseeded, but only the shapes are reported, so the
# printed output is the same on every run.
batch_A = torch.randn(3, 2, 4)  # 3 matrices of size 2x4
batch_B = torch.randn(3, 4, 5)  # 3 matrices of size 4x5
batch_result = batch_A @ batch_B  # Result: 3 matrices of size 2x5
assert batch_result.shape == (3, 2, 5), "unexpected batched matmul shape"
print(f"batch_A shape: {batch_A.shape}")
print(f"batch_B shape: {batch_B.shape}")
print(f"batch_A @ batch_B shape: {batch_result.shape}")

Matrix Multiplication in PyTorch

Matrix A:
tensor([[1., 2., 3.],
        [4., 5., 6.]])
Shape: torch.Size([2, 3])

Matrix B:
tensor([[ 7.,  8.],
        [ 9., 10.],
        [11., 12.]])
Shape: torch.Size([3, 2])

Method 1 - A @ B (@ operator):
tensor([[ 58.,  64.],
        [139., 154.]])

Method 2 - torch.matmul(A, B):
tensor([[ 58.,  64.],
        [139., 154.]])

Method 3 - torch.mm(A, B):
tensor([[ 58.,  64.],
        [139., 154.]])

Method 4 - A.mm(B):
tensor([[ 58.,  64.],
        [139., 154.]])

All methods give the same result!

Element-wise multiplication A * B would fail because shapes don't match:
A shape: torch.Size([2, 3]), B shape: torch.Size([3, 2])
Element-wise multiplication requires same shapes or broadcasting compatibility

Batch matrix multiplication:
batch_A shape: torch.Size([3, 2, 4])
batch_B shape: torch.Size([3, 4, 5])
batch_A @ batch_B shape: torch.Size([3, 2, 5])


In [19]:
# Float vector [0., 1., 2., 3.]; the float bounds make arange produce a
# floating-point tensor rather than an integer one.
x = torch.arange(0.0, 4.0)
x

tensor([0., 1., 2., 3.])

In [20]:
# Turn on gradient tracking for x in place, so later operations on x are
# recorded for backpropagation.
x.requires_grad_(True)
# No backward pass has run on x at this point, which is why the cell shows
# no output for x.grad.
x.grad

In [22]:
# Scalar function y = 2 * (x . x); for x = [0, 1, 2, 3] this evaluates to 28.
y = 2 * torch.dot(x, x)
y

tensor(28., grad_fn=<MulBackward0>)

In [23]:
# Backpropagate from the scalar y. The gradient of 2 * (x . x) with respect
# to x is 4 * x, which matches the recorded result [0., 4., 8., 12.].
y.backward()
x.grad

tensor([ 0.,  4.,  8., 12.])

In [24]:
# Clear the gradient left behind by the earlier backward pass so it does not
# accumulate into this one. The explicit None check replaces the type-checker
# suppression and keeps the cell runnable when x.grad has not been populated.
if x.grad is not None:
    x.grad.zero_()

y = x * x
u = y.detach()  # same values as y, but treated as a constant by autograd
z = u * x

# Because u carries no gradient history, d(sum(u * x))/dx is u itself rather
# than the 3 * x**2 that differentiating x * x * x would give.
z.sum().backward()
x.grad == u

tensor([True, True, True, True])

In [26]:
# tensor.detach(): cutting a tensor loose from the autograd graph.
_RULE = "=" * 60  # separator line printed between sections

print("tensor.detach() explanation:")
print(_RULE)

# A leaf tensor that autograd tracks.
a = torch.tensor([2.0, 3.0], requires_grad=True)
print(
    f"Original tensor a: {a}",
    f"a.requires_grad: {a.requires_grad}",
    sep="\n",
)

# An operation on a tracked tensor yields a result that remembers its origin.
b = a * a
print(
    "\nAfter b = a * a:",
    f"b: {b}",
    f"b.requires_grad: {b.requires_grad}",
    f"b.grad_fn: {b.grad_fn}",  # the node linking b back to a
    sep="\n",
)

# detach() hands back the same values with no link to the graph.
b_detached = b.detach()
print(
    "\nAfter b_detached = b.detach():",
    f"b_detached: {b_detached}",
    f"b_detached.requires_grad: {b_detached.requires_grad}",
    f"b_detached.grad_fn: {b_detached.grad_fn}",
    sep="\n",
)

print(
    "\nKey points about detach():",
    "1. Returns a new tensor that shares the same data",
    "2. But is detached from the computational graph",
    "3. requires_grad=False for the detached tensor",
    "4. No gradients will flow through the detached tensor",
    sep="\n",
)

# Compare how gradients behave with and without detach().
print("\n" + _RULE)
print("Gradient flow demonstration:")

# Case 1: b is still attached to the graph, so backward() reaches a.
a.grad = None  # start from a clean gradient
c = b * 2
loss1 = c.sum()
loss1.backward()
print(f"With normal tensor - a.grad: {a.grad}")

# Case 2: b_detached has no history, so nothing downstream of it can
# backpropagate; backward() raises instead of filling a.grad.
a.grad = None  # start from a clean gradient again
d = b_detached * 2
loss2 = d.sum()
print("Trying to call backward on detached tensor...")
try:
    loss2.backward()
    print(f"With detached tensor - a.grad: {a.grad}")
except RuntimeError as err:
    print(f"ERROR: {err}")
    print("This is expected! Detached tensors can't backpropagate gradients")

print(
    "\nCommon use cases for detach():",
    "- Stopping gradients at certain points",
    "- Creating targets for loss functions",
    "- Implementing techniques like target networks",
    "- Memory optimization in certain scenarios",
    sep="\n",
)

tensor.detach() explanation:
Original tensor a: tensor([2., 3.], requires_grad=True)
a.requires_grad: True

After b = a * a:
b: tensor([4., 9.], grad_fn=<MulBackward0>)
b.requires_grad: True
b.grad_fn: <MulBackward0 object at 0x113953d30>

After b_detached = b.detach():
b_detached: tensor([4., 9.])
b_detached.requires_grad: False
b_detached.grad_fn: None

Key points about detach():
1. Returns a new tensor that shares the same data
2. But is detached from the computational graph
3. requires_grad=False for the detached tensor
4. No gradients will flow through the detached tensor

Gradient flow demonstration:
With normal tensor - a.grad: tensor([ 8., 12.])
Trying to call backward on detached tensor...
ERROR: element 0 of tensors does not require grad and does not have a grad_fn
This is expected! Detached tensors can't backpropagate gradients

Common use cases for detach():
- Stopping gradients at certain points
- Creating targets for loss functions
- Implementing techniques like target ne