In [73]:
import torch
print(torch.__version__)

1.0.0


# Torch Tensors

## Initializing Torch Tensors

In [4]:
tensor1 = torch.tensor([[1,2,3],[4,5,6]])
print(tensor1)

tensor([[1, 2, 3],
        [4, 5, 6]])


Initializing tensors with other data-types and other devices (CPU or CUDA):

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
tensor1f = torch.tensor([[1,2,3],[4,5,6]],dtype=torch.float32, device="cuda") #cpu is default
print("Device is:",device)
print(tensor1f)

Device is: cuda
tensor([[1., 2., 3.],
        [4., 5., 6.]], device='cuda:0')


Initializing tensors for automatic differentiation,

To see what is auto-grad check https://arxiv.org/abs/1811.05031

In [6]:
tensor1fad = torch.tensor([[1,2,3],[4,5,6]],dtype=torch.float32, device="cuda",requires_grad = True) #cpu is default
print(tensor1f)
print(tensor1fad.dtype)
print(tensor1fad.device) ## zero is first gpu, if you have one gpu it is always zero.
print(tensor1fad.shape)
print(tensor1fad.requires_grad)

tensor([[1., 2., 3.],
        [4., 5., 6.]], device='cuda:0')
torch.float32
cuda:0
torch.Size([2, 3])
True


### Other Initialization Methods

In [7]:
tensor2 = torch.empty(size=(3,3)) #values will be whatever in the memory at that moment
print(tensor2)

tensor([[2.9528e+16, 3.0889e-41, 5.6052e-45],
        [0.0000e+00,        nan, 0.0000e+00],
        [1.1578e+27, 4.1666e+34, 5.3853e+08]])


In [8]:
tensor3 = torch.zeros((3,3))
print(tensor3)

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])


In [9]:
tensor4 = torch.rand((3,3)) # from uniform distribution in the interval 0 and 1
print(tensor4)
print(tensor4.dtype)

tensor([[0.2400, 0.9275, 0.5815],
        [0.8824, 0.3535, 0.5638],
        [0.7510, 0.7501, 0.3892]])
torch.float32


In [10]:
tensor5 = torch.ones((2,2))
print(tensor5)

tensor([[1., 1.],
        [1., 1.]])


In [11]:
tensor6 = torch.eye(5,5) # initializes identity matrix
print(tensor6)

tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.]])


In [12]:
tensor7 = torch.arange(start=0,end=5,step=1)
print(tensor7)

tensor([0, 1, 2, 3, 4])


In [13]:
tensor8 = torch.linspace(start=0.1, end=1, steps=10)
print(tensor8)

tensor([0.1000, 0.2000, 0.3000, 0.4000, 0.5000, 0.6000, 0.7000, 0.8000, 0.9000,
        1.0000])


In [14]:
tensor9 = torch.empty(size=(1,5)).normal_(mean = 0, std = 1)
print(tensor9)

tensor([[ 1.2215,  0.1040,  1.1097, -0.5321, -2.1323]])


In [15]:
tensor10 = torch.empty(size=(1,5)).uniform_(0,1)
print(tensor10)

tensor([[0.0810, 0.8847, 0.4055, 0.5601, 0.7550]])


In [16]:
tensor11 = torch.diag(tensor2)
print(tensor11)

tensor([2.9528e+16,        nan, 5.3853e+08])


### Data Type Operations

In [17]:
tensor12 = torch.arange(4)
print(tensor12)
print(tensor12.dtype)
print(tensor12.short()) # creates torch.int16
print(tensor12.long()) # creates torch.int64
print(tensor12.half()) # creates torch.float16
print(tensor12.float()) #creates torch.float32
print(tensor12.double()) #creates torch.float64

tensor([0, 1, 2, 3])
torch.int64
tensor([0, 1, 2, 3], dtype=torch.int16)
tensor([0, 1, 2, 3])
tensor([0., 1., 2., 3.], dtype=torch.float16)
tensor([0., 1., 2., 3.])
tensor([0., 1., 2., 3.], dtype=torch.float64)


### Array to Tensor Conversions

In [18]:
import numpy as np

In [19]:
arr1 = np.zeros((5,5))
tensor13 = torch.from_numpy(arr1) # numpy array to torch tensor
arr1_back = tensor13.numpy() # torch tensor to numpy array
# WARNING: there can be some numerical round-off errors

## Tensor Maths

In [20]:
tensor14 = torch.tensor([1,2,3])
tensor15 = torch.tensor([9,8,7])

Addition:

In [21]:
tensor16 = torch.add(tensor14, tensor15)
print(tensor16)
# or just tensor16 = tensor14 + tensor15

tensor([10, 10, 10])


Subtraction:

In [22]:
tensor17 = tensor14 - tensor15
print(tensor17)

tensor([-8, -6, -4])


Division:

In [23]:
tensor18 = tensor15 / tensor14
print(tensor18)

tensor([9, 4, 2])


Exponentations:

In [24]:
tensor20 = tensor18 ** 2
print(tensor20)

tensor([81, 16,  4])


Comparsions:

In [25]:
print(tensor14 > 0)
print(tensor20 >= 16)

tensor([1, 1, 1], dtype=torch.uint8)
tensor([1, 1, 0], dtype=torch.uint8)


Matrix Multiplications:


In [26]:
tensor21 = torch.rand((2,5))
tensor22 = torch.rand((5,4))
tensor23 = torch.mm(tensor21, tensor22) # 2 x 4
print(tensor23)

tensor([[0.9813, 0.9940, 1.6766, 0.7648],
        [1.5068, 1.2899, 2.4730, 1.8349]])


Element-wise Multiplication:

In [28]:
tensor24 = tensor14 * tensor15
print(tensor24)

tensor([ 9, 16, 21])


Dot Product:

In [30]:
tensor25 = torch.dot(tensor14,tensor15)
print(tensor25)

tensor(46)


Batch Matrix Multiplication:

In [33]:
batch = 32
n = 10
m = 20
p = 30
tensor26 = torch.rand((batch,n,m))
tensor27 = torch.rand((batch,m,p))
tensor28 = torch.bmm(tensor26,tensor27)
#shape = (batch, n, p)
print(tensor28)

tensor([[[4.4860, 4.1094, 5.8712,  ..., 4.8428, 5.3126, 5.4184],
         [5.2873, 4.3785, 6.2349,  ..., 6.2649, 5.9693, 5.9115],
         [4.6183, 4.4562, 5.1974,  ..., 5.3956, 4.7370, 4.7624],
         ...,
         [4.5693, 3.6962, 4.7715,  ..., 4.5770, 5.9074, 4.2569],
         [4.9908, 4.0985, 5.8262,  ..., 6.1090, 5.8616, 5.6223],
         [6.0819, 5.4564, 7.5043,  ..., 6.1924, 6.8475, 6.3794]],

        [[5.2088, 6.3306, 5.0007,  ..., 5.7079, 6.3852, 6.4534],
         [4.6208, 5.7491, 3.4608,  ..., 4.4189, 5.3284, 5.4780],
         [4.3648, 5.2081, 4.1212,  ..., 4.5857, 4.6831, 5.5319],
         ...,
         [5.2402, 6.8368, 6.1730,  ..., 5.7485, 6.3287, 7.0082],
         [4.6092, 5.2739, 4.0838,  ..., 4.5208, 5.0323, 5.6456],
         [4.8233, 5.7883, 4.5134,  ..., 5.2798, 5.4929, 5.7767]],

        [[5.9443, 6.0276, 6.4823,  ..., 6.2598, 5.2528, 5.9868],
         [5.1042, 4.7751, 5.1891,  ..., 4.4834, 4.2521, 5.5360],
         [6.5357, 6.4827, 6.0426,  ..., 5.9891, 5.5619, 5.

Broadcasting:

In [37]:
tensor29 = torch.rand((5,5))
tensor30 = torch.rand((1,5)) #is like each row is identical to each other
tensor31 = tensor29 - tensor30
tensor32 = tensor29 ** tensor30
print(tensor29)
print(tensor30)
print(tensor31)
print(tensor32)

tensor([[0.3146, 0.5835, 0.5399, 0.0835, 0.4572],
        [0.2126, 0.1363, 0.7588, 0.4865, 0.0232],
        [0.4498, 0.5166, 0.6925, 0.9647, 0.7111],
        [0.2460, 0.3827, 0.6107, 0.8897, 0.8874],
        [0.0862, 0.5934, 0.5778, 0.6017, 0.6110]])
tensor([[0.2452, 0.0699, 0.6109, 0.5753, 0.8402]])
tensor([[ 6.9374e-02,  5.1363e-01, -7.0942e-02, -4.9179e-01, -3.8300e-01],
        [-3.2614e-02,  6.6469e-02,  1.4791e-01, -8.8753e-02, -8.1697e-01],
        [ 2.0457e-01,  4.4674e-01,  8.1596e-02,  3.8937e-01, -1.2913e-01],
        [ 8.1909e-04,  3.1283e-01, -1.4323e-04,  3.1441e-01,  4.7253e-02],
        [-1.5903e-01,  5.2354e-01, -3.3112e-02,  2.6462e-02, -2.2919e-01]])
tensor([[0.7531, 0.9631, 0.6863, 0.2397, 0.5181],
        [0.6841, 0.8700, 0.8448, 0.6607, 0.0424],
        [0.8221, 0.9549, 0.7989, 0.9795, 0.7509],
        [0.7090, 0.9351, 0.7399, 0.9350, 0.9045],
        [0.5482, 0.9642, 0.7152, 0.7466, 0.6611]])


#### Other Type of Operations on Tensors:

In [41]:
tensor33 = torch.rand((4,4))
tensor34 = torch.sum(tensor33,dim=0)
tensor35 = torch.sum(tensor33,dim=1)
print(tensor33)
print(tensor34)
print(tensor35)

tensor([[0.1288, 0.1928, 0.3714, 0.1745],
        [0.9756, 0.4671, 0.6382, 0.4853],
        [0.6258, 0.3075, 0.1725, 0.4017],
        [0.1607, 0.6643, 0.5309, 0.3876]])
tensor([1.8909, 1.6316, 1.7130, 1.4490])
tensor([0.8676, 2.5661, 1.5074, 1.7434])


In [43]:
values1, indices1 = torch.max(tensor33,dim=0) #tensor33.max(dim=0)
values2, indices2 = torch.max(tensor33,dim=1)
print(tensor33)
print("Values1, indices1 (dim0: columns): {} and {}".format(values1,indices1))
print("Values2, indices2 (dim1: rows): {} and {}".format(values2,indices2))


tensor([[0.1288, 0.1928, 0.3714, 0.1745],
        [0.9756, 0.4671, 0.6382, 0.4853],
        [0.6258, 0.3075, 0.1725, 0.4017],
        [0.1607, 0.6643, 0.5309, 0.3876]])
Values1, indices1 (dim0: columns): tensor([0.9756, 0.6643, 0.6382, 0.4853]) and tensor([1, 3, 1, 1])
Values2, indices2 (dim1: rows): tensor([0.3714, 0.9756, 0.6258, 0.6643]) and tensor([2, 0, 0, 1])


In [45]:
tensor36 = tensor33 - 0.50
print(tensor36)
tensor37 = torch.abs(tensor36)
print(tensor37)

tensor([[-0.3712, -0.3072, -0.1286, -0.3255],
        [ 0.4756, -0.0329,  0.1382, -0.0147],
        [ 0.1258, -0.1925, -0.3275, -0.0983],
        [-0.3393,  0.1643,  0.0309, -0.1124]])
tensor([[0.3712, 0.3072, 0.1286, 0.3255],
        [0.4756, 0.0329, 0.1382, 0.0147],
        [0.1258, 0.1925, 0.3275, 0.0983],
        [0.3393, 0.1643, 0.0309, 0.1124]])


In [48]:
tensor38 = torch.argmax(tensor33,dim=0) #actually it returns torch.max's second return argument
tensor39 = torch.argmin(tensor33,dim=1)
print(tensor38)
print(tensor39)

tensor([1, 3, 1, 1])
tensor([0, 1, 2, 0])


In [50]:
tensor40 = torch.mean(tensor33.float(),dim=0) #input has to be float
print(tensor40)

tensor([0.4727, 0.4079, 0.4283, 0.3623])


In [54]:
sorted_tensor33, indices3 = torch.sort(tensor33, dim=0, descending=False)
print(sorted_tensor33)

tensor([[0.1288, 0.1928, 0.1725, 0.1745],
        [0.1607, 0.3075, 0.3714, 0.3876],
        [0.6258, 0.4671, 0.5309, 0.4017],
        [0.9756, 0.6643, 0.6382, 0.4853]])


In [58]:
tensor41 = torch.clamp(tensor33, min=10)
#check all elements of tensor33 and if it is less than 10, gonna set to zero
print(tensor41)

tensor([[10., 10., 10., 10.],
        [10., 10., 10., 10.],
        [10., 10., 10., 10.],
        [10., 10., 10., 10.]])


## Tensor Indexing

In [62]:
batch_size = 10
features = 25
tensor42 = torch.rand((batch_size,features))
print(tensor42[0,:].shape)
print(tensor42[:,0].shape)
print(tensor42[2,0:10])

torch.Size([25])
torch.Size([10])
tensor([0.0836, 0.4753, 0.7446, 0.5352, 0.5841, 0.4024, 0.3516, 0.3523, 0.5565,
        0.8405])


Fancy Indexing:

In [64]:
tensor43 = torch.arange(10)
print(tensor43)
indices = [1,3,7]
print(tensor43[indices])

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
tensor([1, 3, 7])


In [65]:
print(tensor43[(tensor43 <2) | (tensor43 > 8)])

tensor([0, 1, 9])


In [67]:
print(tensor43[tensor43.remainder(2) == 0])

tensor([0, 2, 4, 6, 8])


#### Other Operations

In [72]:
print(tensor43.where(tensor43 > 5, tensor43**2))

tensor([ 0,  1,  4,  9, 16, 25,  6,  7,  8,  9])


In [75]:
print(torch.tensor([0,0,1,2,2,3,4,4]).unique())
#sorted

tensor([4, 3, 2, 1, 0])


In [77]:
print(tensor43.ndimension()) #number of dimensions

1


In [79]:
print(tensor43.numel()) #number of elements in tensor
#it is easier to get number of elements in lower dimensions
#but this function is useful in higher dimensions

10


## Tensor Reshaping

In [85]:
tensor44 = torch.arange(9)
tensor44_3x3 = tensor44.view(3,3)
tensor44_3x3_1 = tensor44.reshape(3,3)
print(tensor44)
print(tensor44_3x3)
print(tensor44_3x3_1)
#reshape tries to return a view if possible, 
#otherwise copies to data to a contiguous tensor and returns the view on it. 
# view -> contiguous
# reshape -> doesnt really matters (safe one but performance loss)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8])
tensor([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]])
tensor([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]])


In [94]:
tensor45 = tensor44_3x3_1.t() #transpose
print(tensor45)
print(tensor45.view(9)) # should be [0,3,6,1,4,7,2,5,8]
#jumping steps in memory
#now the transpose version is not a contiguous blocks of memory.
#look at the error:
#how to fiX:
print(tensor45.contiguous().view(9))

tensor([[0, 3, 6],
        [1, 4, 7],
        [2, 5, 8]])


RuntimeError: invalid argument 2: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Call .contiguous() before .view(). at /opt/conda/conda-bld/pytorch_1544081038057/work/aten/src/TH/generic/THTensor.cpp:213

In [96]:
tensor46 = torch.rand((5,5))
tensor47 = torch.rand((5,5))
tensor48 = torch.cat((tensor46,tensor47), dim=0)
print(tensor48)

tensor([[0.9023, 0.0174, 0.0234, 0.2747, 0.4161],
        [0.1898, 0.9965, 0.0763, 0.9456, 0.1874],
        [0.3045, 0.6030, 0.2394, 0.0557, 0.5099],
        [0.8243, 0.5855, 0.9251, 0.7641, 0.8823],
        [0.9357, 0.6500, 0.9756, 0.7506, 0.3703],
        [0.1949, 0.7359, 0.3033, 0.6607, 0.2303],
        [0.7564, 0.8474, 0.3711, 0.1613, 0.1588],
        [0.8532, 0.7324, 0.1657, 0.2771, 0.1029],
        [0.1713, 0.6983, 0.2314, 0.8304, 0.7855],
        [0.1971, 0.5955, 0.1155, 0.3583, 0.9472]])


In [98]:
tensor49 = tensor48.view(-1)
print(tensor49)

tensor([0.9023, 0.0174, 0.0234, 0.2747, 0.4161, 0.1898, 0.9965, 0.0763, 0.9456,
        0.1874, 0.3045, 0.6030, 0.2394, 0.0557, 0.5099, 0.8243, 0.5855, 0.9251,
        0.7641, 0.8823, 0.9357, 0.6500, 0.9756, 0.7506, 0.3703, 0.1949, 0.7359,
        0.3033, 0.6607, 0.2303, 0.7564, 0.8474, 0.3711, 0.1613, 0.1588, 0.8532,
        0.7324, 0.1657, 0.2771, 0.1029, 0.1713, 0.6983, 0.2314, 0.8304, 0.7855,
        0.1971, 0.5955, 0.1155, 0.3583, 0.9472])


In [105]:
batch = 16
tensor50 = torch.rand((batch,2,3))
tensor51 = tensor50.view(batch,-1)
tensor52 = tensor50.permute(0,2,1)
print(tensor51)
print(tensor52)
print(tensor50.shape)
print(tensor51.shape)
print(tensor52.shape)

tensor([[0.1183, 0.4753, 0.8155, 0.9203, 0.0240, 0.9939],
        [0.0082, 0.6212, 0.9916, 0.8313, 0.6680, 0.2797],
        [0.5901, 0.1485, 0.6576, 0.9793, 0.9190, 0.5203],
        [0.3311, 0.3065, 0.2698, 0.3262, 0.6138, 0.0495],
        [0.6868, 0.3269, 0.3556, 0.0920, 0.3055, 0.6259],
        [0.1605, 0.5091, 0.5690, 0.3127, 0.6657, 0.3449],
        [0.4915, 0.8223, 0.1271, 0.3587, 0.4400, 0.7278],
        [0.2048, 0.8042, 0.4990, 0.3775, 0.6469, 0.5525],
        [0.5193, 0.3407, 0.5706, 0.0144, 0.2177, 0.1730],
        [0.4841, 0.9914, 0.2462, 0.8088, 0.2293, 0.4570],
        [0.9818, 0.9922, 0.4088, 0.7719, 0.8278, 0.3224],
        [0.4523, 0.8083, 0.4713, 0.7553, 0.3349, 0.2724],
        [0.4558, 0.4578, 0.2966, 0.2713, 0.9666, 0.2902],
        [0.7687, 0.6933, 0.2296, 0.0957, 0.2220, 0.9356],
        [0.4461, 0.9775, 0.3541, 0.4098, 0.5456, 0.6858],
        [0.1335, 0.2065, 0.6645, 0.4544, 0.4699, 0.1660]])
tensor([[[0.1183, 0.9203],
         [0.4753, 0.0240],
         [0.8155,

In [108]:
tensor53 = torch.arange(10)
print(tensor53.unsqueeze(0))
print(tensor53.unsqueeze(0).shape)
print(tensor53.unsqueeze(1))
print(tensor53.unsqueeze(1).shape)

tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
torch.Size([1, 10])
tensor([[0],
        [1],
        [2],
        [3],
        [4],
        [5],
        [6],
        [7],
        [8],
        [9]])
torch.Size([10, 1])


In [112]:
tensor54 = torch.arange(10).unsqueeze(0).unsqueeze(1)
tensor55 = tensor54.squeeze(1).squeeze(0)
print(tensor54)
print(tensor54.shape)
print(tensor55)
print(tensor55.shape)

tensor([[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]])
torch.Size([1, 1, 10])
tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
torch.Size([10])


## PyTorch's Automatic Differentation

https://openreview.net/pdf?id=BJJsrmfCZ

Automatic Differentiation is a building block of not only PyTorch, but every DL library out there. In my opinion, PyTorch's automatic differentiation engine, called Autograd is a brilliant tool to understand how automatic differentiation works. This will not only help you understand PyTorch better, but also other DL libraries.
Modern neural network architectures can have millions of learnable parameters. From a computational point of view, training a neural network consists of two phases:

- A forward pass to compute the value of the loss function.
- A backward pass to compute the gradients of the learnable parameters.

Very simple computational graph:

<img src="https://blog.paperspace.com/content/images/2019/03/computation_graph_forward.png" alt="Drawing" style="width: 500px;"/>



And its forward equations:

$b = w1 * a $

$c = w2 * a$

$d = w3*b + w4*c$

$L=10-d$

and backward equations:

$$\frac{\partial{L}}{\partial{w_4}} = \frac{\partial{L}}{\partial{d}} * \frac{\partial{d}}{\partial{w_4}}$$

$$\frac{\partial{L}}{\partial{w_3}} = \frac{\partial{L}}{\partial{d}} * \frac{\partial{d}}{\partial{w_3}}$$

$$\frac{\partial{L}}{\partial{w_2}} = \frac{\partial{L}}{\partial{d}} * \frac{\partial{d}}{\partial{c}} * \frac{\partial{c}}{\partial{w_2}}$$

$$\frac{\partial{L}}{\partial{w_1}} = \frac{\partial{L}}{\partial{d}} * \frac{\partial{d}}{\partial{b}} * \frac{\partial{b}}{\partial{w_1}}$$

So the computational graph for forward equations must be:
    
<img src="https://blog.paperspace.com/content/images/2019/03/computation_graph.png" alt="Drawing" style="width: 400px;"/>

So the computational graph for backward equations must be:
    
<img src="https://blog.paperspace.com/content/images/2019/03/full_graph.png" alt="Drawing" style="width: 400px;"/>

If you see, the product is precisely the same expression we derived using chain rule. If there is more than one path to a variable from L then, we multiply the edges along each path and then add them together. For example, $\frac{\partial L}{\partial a}$ is computed as:

$$\frac{\partial{L}}{\partial{a}} = \frac{\partial{L}}{\partial{d}}*\frac{\partial{d}}{\partial{b}}*\frac{\partial{b}}{\partial{a}} + \frac{\partial{L}}{\partial{d}}*\frac{\partial{d}}{\partial{c}}*\frac{\partial{c}}{\partial{a}}$$

Now, lets code:

In [137]:
a = torch.randn((3,3), requires_grad = True)
w1 = torch.randn((3,3), requires_grad = True)
print(w1)
w2 = torch.randn((3,3), requires_grad = True)
w3 = torch.randn((3,3), requires_grad = True)
w4 = torch.randn((3,3), requires_grad = True)

b = w1*a 
c = w2*a
d = w3*b + w4*c 
L = 10 - d

print("The grad fn for a is: ", a.grad_fn)
print("The grad fn for d is: ", d.grad_fn)

tensor([[-0.1491, -0.7433, -1.4484],
        [ 0.3656,  0.8438,  0.4780],
        [ 2.0702,  0.8685, -0.2250]], requires_grad=True)
The grad fn for a is:  None
The grad fn for d is:  <AddBackward0 object at 0x7f17e6b572b0>


In [129]:
## L.backward() will give error
#This is because gradients can be computed with respect to scalar values by definition.
#You can't exactly differentiate a vector with respect to another vector. 
#The mathematical entity used for such cases is called a Jacobian.

In [138]:
learning_rate = 0.01
L = (10-d).sum()
L.backward()
w1 = w1 - learning_rate * w1.grad

In [139]:
print(w1)

tensor([[-0.1555, -0.7489, -1.4439],
        [ 0.3755,  0.8516,  0.4825],
        [ 2.0660,  0.8642, -0.2364]], grad_fn=<SubBackward0>)


Very good resoruce and reference for me: https://blog.paperspace.com/pytorch-101-understanding-graphs-and-automatic-differentiation/
check for more!