In [1]:
import torch
import numpy as np

TENSOR BASICS

In [2]:
#create empty tensor
x = torch.empty(2,2,3)
x

tensor([[[0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.]]])

In [3]:
#create tensor with random values
x = torch.rand(2,2)
x

tensor([[0.4462, 0.1452],
        [0.4127, 0.6306]])

In [4]:
#create tensor with zeros
x = torch.zeros(2,3)
x

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [5]:
#change dtype
print(x.dtype)
x = torch.ones(2,2,dtype = torch.float16)
print(x.dtype)

torch.float32
torch.float16


In [6]:
#tensor from list
x = torch.tensor([[1,2],[3,4]],dtype = torch.int32)
x

tensor([[1, 2],
        [3, 4]], dtype=torch.int32)

BASIC OPERATIONS

In [7]:
x = torch.rand(2,2)
y = torch.rand(2,2)
print(x)
print(y)


tensor([[0.0632, 0.8962],
        [0.4276, 0.7508]])
tensor([[0.5067, 0.4281],
        [0.0997, 0.8197]])


In [8]:
#addition
z = x+y
z

tensor([[0.5699, 1.3243],
        [0.5273, 1.5704]])

In [9]:
#addition
z = torch.add(x,y)
z

tensor([[0.5699, 1.3243],
        [0.5273, 1.5704]])

In [10]:
#inplace addition
y.add_(x)   #any operation with a trailing _ to it will perform an inplace operation 
y

tensor([[0.5699, 1.3243],
        [0.5273, 1.5704]])

In [11]:
print(x)
print(y)

tensor([[0.0632, 0.8962],
        [0.4276, 0.7508]])
tensor([[0.5699, 1.3243],
        [0.5273, 1.5704]])


In [12]:
#elementwise multiplication
# z = x*y     #option 1
z = torch.mul(x,y)    #option 2
# y.mul_(x)      #option 3
z

tensor([[0.0360, 1.1869],
        [0.2255, 1.1790]])

In [13]:
#elementwise division
# z = x/y 
z = torch.div(x,y)
# x.div_(y)
z

tensor([[0.1109, 0.6768],
        [0.8110, 0.4781]])

In [14]:
#slicing operation

x = torch.rand(5,3)
print(x)

print(x[:,0]) #get all rows and one column
print(x[1,:]) #print only second rows and all columns


tensor([[0.8152, 0.5574, 0.2156],
        [0.7669, 0.3854, 0.6828],
        [0.0740, 0.4851, 0.4796],
        [0.7932, 0.4709, 0.8729],
        [0.5770, 0.8560, 0.0697]])
tensor([0.8152, 0.7669, 0.0740, 0.7932, 0.5770])
tensor([0.7669, 0.3854, 0.6828])


In [15]:
#using .item() function
#can only be used if there is only one element present in the tensor

print(x[1,1])
print(x[1][1])
print(x[1,1].item())

tensor(0.3854)
tensor(0.3854)
0.3854409456253052


In [16]:
#reshaping a tensor
x = torch.rand(2,8)
print(x)
y = x.view(16)
print(y)

tensor([[0.0475, 0.7178, 0.6123, 0.6944, 0.4010, 0.1343, 0.8571, 0.2022],
        [0.4790, 0.1329, 0.6101, 0.0310, 0.3820, 0.4195, 0.8741, 0.8850]])
tensor([0.0475, 0.7178, 0.6123, 0.6944, 0.4010, 0.1343, 0.8571, 0.2022, 0.4790,
        0.1329, 0.6101, 0.0310, 0.3820, 0.4195, 0.8741, 0.8850])


In [17]:
x = torch.rand(4,4)
print(x)

y = x.view(2,8)
print(y)

#use of -1 in view

y = x.view(-1,8)
print(y)

y = x.view(8,-1)
print(y)
print(y.size())  #use of size() function

tensor([[0.5373, 0.7256, 0.0620, 0.3998],
        [0.1673, 0.4472, 0.5377, 0.1149],
        [0.1610, 0.2202, 0.5549, 0.3996],
        [0.7833, 0.2033, 0.5134, 0.4185]])
tensor([[0.5373, 0.7256, 0.0620, 0.3998, 0.1673, 0.4472, 0.5377, 0.1149],
        [0.1610, 0.2202, 0.5549, 0.3996, 0.7833, 0.2033, 0.5134, 0.4185]])
tensor([[0.5373, 0.7256, 0.0620, 0.3998, 0.1673, 0.4472, 0.5377, 0.1149],
        [0.1610, 0.2202, 0.5549, 0.3996, 0.7833, 0.2033, 0.5134, 0.4185]])
tensor([[0.5373, 0.7256],
        [0.0620, 0.3998],
        [0.1673, 0.4472],
        [0.5377, 0.1149],
        [0.1610, 0.2202],
        [0.5549, 0.3996],
        [0.7833, 0.2033],
        [0.5134, 0.4185]])
torch.Size([8, 2])


In [18]:
#torch to numpy array
a = torch.ones(4)
print(a)
b = a.numpy()
print(b)
print(type(b))

tensor([1., 1., 1., 1.])
[1. 1. 1. 1.]
<class 'numpy.ndarray'>


In [19]:
#changing the tensor also changes the numpy array i.e. both share the same memory - only applicable on CPU
a.add_(1)
print(a)
print(b)


tensor([2., 2., 2., 2.])
[2. 2. 2. 2.]


In [20]:
#Requires Grad - Argument given to instruct that the gradients for the variable might be needed later
#by default, requres_grad is False
x = torch.ones(5, requires_grad=True)
print(x)

tensor([1., 1., 1., 1., 1.], requires_grad=True)


AUTOGRAD

Autograd: This class is an engine to calculate derivatives (Jacobian-vector product to be more precise). It records a graph of all the operations performed on a gradient enabled tensor and creates an acyclic graph called the dynamic computational graph. The leaves of this graph are input tensors and the roots are output tensors. Gradients are calculated by tracing the graph from the root to the leaf and multiplying every gradient in the way using the chain rule.


For each backpropagation iteration, several gradients are calculated and something called a computation graph is built for storing these gradient functions. PyTorch does it by building a Dynamic Computational Graph (DCG). This graph is built from scratch in every iteration providing maximum flexibility to gradient calculation. For example, for a forward operation (function)Mul a backward operation (function) called MulBackwardis dynamically integrated in the backward graph for computing the gradient.

--------------------------------------REFERENCES--------------------------------------------
                                                   
https://towardsdatascience.com/pytorch-autograd-understanding-the-heart-of-pytorchs-magic-2686cd94ec95
                                   
https://www.youtube.com/watch?v=hjnVLfvhN0Q&t=1334


In [21]:
x = torch.rand(3, requires_grad=True)
print(x)
y = x+2
print(y)
z = y*y*2
z = z.mean()
print(z)
z.backward()
x.grad

tensor([0.8152, 0.9284, 0.3738], requires_grad=True)
tensor([2.8152, 2.9284, 2.3738], grad_fn=<AddBackward0>)
tensor(14.7573, grad_fn=<MeanBackward0>)


tensor([3.7536, 3.9045, 3.1651])

In [22]:
gen = torch.Generator().manual_seed(1)
x = torch.rand(3,generator=gen,requires_grad=True)
print(x)
y = x+2
print(y)
z = y*y*2
z = z.mean()
print(z)
z.backward()
m = x.grad
m 

tensor([0.7576, 0.2793, 0.4031], requires_grad=True)
tensor([2.7576, 2.2793, 2.4031], grad_fn=<AddBackward0>)
tensor(12.3830, grad_fn=<MeanBackward0>)


tensor([3.6768, 3.0391, 3.2041])

In [23]:
gen = torch.Generator().manual_seed(1)
x = torch.rand(3,generator=gen,requires_grad=True)
print(x)
y = x+2
print(y)
z = y*y*2
z = z.sum()
print(z)
z.backward()
n = x.grad
n 

tensor([0.7576, 0.2793, 0.4031], requires_grad=True)
tensor([2.7576, 2.2793, 2.4031], grad_fn=<AddBackward0>)
tensor(37.1491, grad_fn=<SumBackward0>)


tensor([11.0305,  9.1172,  9.6123])

In [24]:
n/m

tensor([3.0000, 3.0000, 3.0000])

In [25]:
#calculating gradient by manually entering the v vector
gen = torch.Generator().manual_seed(1)
x = torch.rand(3,requires_grad=True,generator=gen)
print(x)
y = x+2
print(y)
z = y*y*2
print(z)
v = torch.tensor([1.,1.,1.])
z.backward(v) #entering the value of v vector to be multiplied by transpose of Jacobian matrix
print(x.grad)


tensor([0.7576, 0.2793, 0.4031], requires_grad=True)
tensor([2.7576, 2.2793, 2.4031], grad_fn=<AddBackward0>)
tensor([15.2091, 10.3905, 11.5495], grad_fn=<MulBackward0>)
tensor([11.0305,  9.1172,  9.6123])


In [26]:
#Prevent variable from tracking gradients
x = torch.rand(3,requires_grad=True)
print(x)

# x.requires_grad_(False)
# x.detach()
# with torch.no_grad():



tensor([0.7001, 0.6403, 0.3298], requires_grad=True)


In [27]:
x.requires_grad_(False)
print(x)

tensor([0.7001, 0.6403, 0.3298])


In [28]:
x = torch.rand(3,requires_grad=True)
print(x)
y = x.detach()
print(y)

tensor([0.0841, 0.7024, 0.2407], requires_grad=True)
tensor([0.0841, 0.7024, 0.2407])


In [29]:
x = torch.rand(3,requires_grad=True)
print(x)
y = x+2
print(y)
with torch.no_grad():
    y = x+2
    print(y)

tensor([0.4968, 0.2443, 0.4869], requires_grad=True)
tensor([2.4968, 2.2443, 2.4869], grad_fn=<AddBackward0>)
tensor([2.4968, 2.2443, 2.4869])


In [30]:
#Gradients are added up in backend
#They need to be cleared up using grad.zero_()
#An example:

x = torch.ones(4,requires_grad=True)
print(x)

for epoch in range(3):
    y = 3*x
    y = y.sum()
    y.backward()
    print(x.grad)


tensor([1., 1., 1., 1.], requires_grad=True)
tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])


In [31]:
x = torch.ones(4,requires_grad=True)
print(x)

for epoch in range(3):
    y = x*3
    y = y.sum()
    y.backward()
    print(x.grad)
    x.grad.zero_() #flush gradients


tensor([1., 1., 1., 1.], requires_grad=True)
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])


BACKPROPAGATION

Concepts :
1. Chain Rule
2. Computational Graph

Steps:
1. Forward Pass : Compute Loss
2. Compute  Local gradients
3. Backward Pass: Compute dLoss/dWeights using the chain Rule

In [32]:
x = torch.tensor(1.)
y = torch.tensor(2.)

w = torch.tensor(1., requires_grad=True)

#Forward pass and calculate the loss
y_hat = w*x
loss = y_hat-y
loss = loss**2

print("loss",loss)

#backward pass
loss.backward()
grad = w.grad
print(w.grad)

loss tensor(1., grad_fn=<PowBackward0>)
tensor(-2.)


GRADIENT DESCENT

1. Using only Numpy

In [38]:
import numpy as np 

#f(x) = 2x

x = np.array([1.,2.,3.,4.],dtype = np.float32)
y = np.array([2.,4.,6.,8.],dtype = np.float32)

w = 0.0

#model prediction
def forward(x):
    return w*x

#loss = MSE
def loss(y, y_pred):
    return ((y_pred-y)**2).mean()

#gradient 
#MSE = 1/N * (w*x - y)**2
#dL/dw = 1/N 2x (w*x - y)
def gradient(x,y,y_pred):
    return np.dot(2*x,y_pred-y).mean()

print(f'Prediction before training : f(5) ={forward(5):.3f}')

#Training
learning_rate = 0.01
n_iters = 20

for epoch in range(n_iters):
    y_pred = forward(x)
    l = loss(y,y_pred)
    dw = gradient(x,y,y_pred)

    #update weights
    w-=learning_rate*dw
    if epoch%2 ==0:
        print(f'epoch {epoch+1}: w = {w:.3f}, loss ={l:.8f}')

print(f'Prediction after training : f(5) ={forward(5):.3f}')



Prediction before training : f(5) =0.000
epoch 1: w = 1.200, loss =30.00000000
epoch 3: w = 1.872, loss =0.76800019
epoch 5: w = 1.980, loss =0.01966083
epoch 7: w = 1.997, loss =0.00050332
epoch 9: w = 1.999, loss =0.00001288
epoch 11: w = 2.000, loss =0.00000033
epoch 13: w = 2.000, loss =0.00000001
epoch 15: w = 2.000, loss =0.00000000
epoch 17: w = 2.000, loss =0.00000000
epoch 19: w = 2.000, loss =0.00000000
Prediction after training : f(5) =10.000


2. Doing the same computations using pytorch

In [43]:
#f(x) = 2x

x = torch.tensor([1.,2.,3.,4.],dtype = torch.float32)
y = torch.tensor([2.,4.,6.,8.],dtype = torch.float32)

w = torch.tensor(0.0,requires_grad=True)

#model prediction
def forward(x):
    return w*x

#loss = MSE
def loss(y, y_pred):
    return ((y_pred-y)**2).mean()


print(f'Prediction before training : f(5) ={forward(5):.3f}')

#Training
learning_rate = 0.01
n_iters = 100

for epoch in range(n_iters):
    y_pred = forward(x)
    l = loss(y,y_pred)
    #gradients = backward pass
    l.backward()

    #update weights
    with torch.no_grad():
        w-=learning_rate*w.grad
    
    #empty/flush gradients
    w.grad.zero_()

    if epoch%10 ==0:
        print(f'epoch {epoch+1}: w = {w:.3f}, loss ={l:.8f}')

print(f'Prediction after training : f(5) ={forward(5):.3f}')


Prediction before training : f(5) =0.000
epoch 1: w = 0.300, loss =30.00000000
epoch 11: w = 1.665, loss =1.16278565
epoch 21: w = 1.934, loss =0.04506890
epoch 31: w = 1.987, loss =0.00174685
epoch 41: w = 1.997, loss =0.00006770
epoch 51: w = 1.999, loss =0.00000262
epoch 61: w = 2.000, loss =0.00000010
epoch 71: w = 2.000, loss =0.00000000
epoch 81: w = 2.000, loss =0.00000000
epoch 91: w = 2.000, loss =0.00000000
Prediction after training : f(5) =10.000
