## Manually  calculating gradient

In [3]:
## e.g. if the function is y = x**2, then dy/dx = 2*x
def dy_dx(x):
  """Return the hand-derived derivative of y = x**2 evaluated at x (dy/dx = 2*x)."""
  slope = 2 * x
  return slope

In [4]:
dy_dx(3)

6

## calculating gradient of more complex input

In [13]:
## e.g. if the functions are y = x**2 and z = sin(y), then by the chain rule dz/dx = dz/dy * dy/dx = cos(x**2) * 2*x

import math
def dz_dx(x):
  """Return dz/dx at x for the nested function z = sin(y), y = x**2.

  Hand-derived with the chain rule: dz/dx = dy/dx * dz/dy = 2*x * cos(x**2).
  """
  dy_dx_term = 2 * x               # dy/dx
  dz_dy_term = math.cos(x ** 2)    # dz/dy evaluated at y = x**2
  return dy_dx_term * dz_dy_term

In [14]:
dz_dx(2)

-2.6145744834544478

### now more complexity in inputs

### e.g, y = x^2, z = sin(y),  u = e^z

 ## as the nesting grows, deriving these derivatives by hand becomes tedious and error-prone,
 ## that is where PyTorch's autograd comes in

### Now with torch autograd

In [17]:
import torch
x =  torch.tensor(3.0, requires_grad=True)  ## tracking the operations...(computation graph)

In [18]:
y = x**2

In [19]:
x   ### requires grad true

tensor(3., requires_grad=True)

In [20]:
y ## it looks back which operations are performed previously like x was raised power by 2 and we got y ...etc...etc

tensor(9., grad_fn=<PowBackward0>)

In [21]:
y.backward()  ## dy/dx  ,,, all the derivatives got calculated in the backward direction

In [22]:
x.grad ## this is derivative

tensor(6.)

### now again let's take the example of nested function like y = x^2, z = sin(y)

In [23]:
x = torch.tensor(3.0, requires_grad=True) ## this is x

In [24]:
y = x ** 2  ## this is y

In [25]:
z = torch.sin(y)  ## this is z

In [26]:
print(x)
print(y)
print(z)

## the computation graph will look like:  x --> (sq) --> y --> sin -- z ....forward direction

tensor(3., requires_grad=True)
tensor(9., grad_fn=<PowBackward0>)
tensor(0.4121, grad_fn=<SinBackward0>)


In [27]:
### now to calculate dz/dx  --- we have to go backward direction via chain rule

z.backward()   ## all the derivatives got calculated.. via chain-rule


In [28]:
x.grad  ## this is derivative we are looking for dz/dx

tensor(-5.4668)

In [29]:
### by default PyTorch does not keep .grad for intermediate (non-leaf) nodes, so y.grad is None here (call y.retain_grad() before backward() to keep it):

y.grad

  y.grad


### now a simple neural network example (gradients derived manually, no autograd)

-- we are using a scalar (a single input value such as 6.7), not a vector, for the demo

In [30]:
## building the input schema for NN

import torch

## inputs
x = torch.tensor(6.7)  ## input feature
y = torch.tensor(0.0)  ## this is true label(binary classification)


w = torch.tensor(1.0)  ## weight
b = torch.tensor(0.0)  ## bias

In [31]:
## this computes the loss with binary cross-entropy (remember: the input values are scalars, not vectors)



def binary_cross_entropy(prediction, target):
  """Element-wise binary cross-entropy between predicted probabilities and labels.

  Computes -(target * log(p) + (1 - target) * log(1 - p)) after clamping p
  away from 0 and 1 so that neither logarithm receives 0.

  Args:
    prediction: tensor of predicted probabilities.
    target: tensor of true labels (0 or 1 in the notebook's demo).

  Returns:
    Tensor holding the loss value(s).
  """
  epsilon = 1e-8   ### to prevent log(0)
  lower = epsilon
  upper = 1 - epsilon
  if prediction.is_floating_point():
    limits = torch.finfo(prediction.dtype)
    # In float32, 1 - 1e-8 rounds to exactly 1.0, so the upper clamp would be a
    # no-op and a saturated prediction of 1.0 would still hit log(1 - 1) = -inf.
    # Use a margin the tensor's dtype can actually represent.
    upper = 1 - max(epsilon, limits.eps)
    # Same idea for the lower bound in low-precision dtypes where 1e-8 underflows;
    # for float32/float64 this leaves the bound at epsilon.
    lower = max(epsilon, limits.tiny)
  prediction = torch.clamp(prediction, lower, upper) ##  to prevent exploding values during loss computation
  return -(target * torch.log(prediction) + (1 - target) * torch.log(1 - prediction))

In [32]:
## forward pass
z = w * x + b    ## weighted sum (linear part)
y_pred = torch.sigmoid(z)   ## predicted  probability


# compute binary_cross_entropy loss
loss = binary_cross_entropy(y_pred, y)

In [34]:
## now we calculate the derivatives by hand and combine them with the chain rule:
##   dL/dw = dL/dy_pred * dy_pred/dz * dz/dw      (and the same with dz/db for the bias)

# 1-   dL/d(y_pred): derivative of the binary cross-entropy loss with respect to the prediction
#      L = -(y*log(p) + (1-y)*log(1-p))  =>  dL/dp = (p - y) / (p*(1-p))
dloss_dy_pred  = (y_pred - y)/(y_pred*(1-y_pred))

# 2-   dy_pred/dz: derivative of the prediction with respect to z (sigmoid derivative: s*(1-s))
dy_pred_dz = y_pred *  (1 - y_pred)

# 3.   dz/dw and dz/db: derivative of z = w*x + b with respect to w and b
dz_dw = x   # dz/dw = x
dz_db = 1   # dz/db = 1


# multiply the three local derivatives along the path loss -> y_pred -> z -> parameter
dL_dw = dloss_dy_pred * dy_pred_dz * dz_dw   ## chain rule
dL_db = dloss_dy_pred * dy_pred_dz * dz_db   ## chain rule

In [35]:
print(f"manually calculated gradient of loss w.r.t weight (dw): {dL_dw}")
print(f"manually calculated gradient of loss w.r.t bias (db): {dL_db}")

manually calculated gradient of loss w.r.t weight (dw): 6.691762447357178
manually calculated gradient of loss w.r.t bias (db): 0.998770534992218


### now we are going to use PyTorch autograd (no manual derivation)

-- we are using scalar (only one input like 6.7)...no vector ...for demo

In [36]:
## building the input schema for NN

import torch

## inputs
x = torch.tensor(6.7)  ## input feature
y = torch.tensor(0.0)  ## this is true label(binary classification)


w = torch.tensor(1.0, requires_grad=True)  ## weight: requires_grad=True because we need the gradient of the loss with respect to w
b = torch.tensor(0.0, requires_grad=True)  ## bias: requires_grad=True because we need the gradient of the loss with respect to b

In [39]:
z = w * x + b  ## weighted sum (linear part)
z

tensor(6.7000, grad_fn=<AddBackward0>)

In [40]:
y_pred = torch.sigmoid(z) ## passing z in sigmoid
y_pred

tensor(0.9988, grad_fn=<SigmoidBackward0>)

In [41]:
loss = binary_cross_entropy(y_pred, y) ## this is the loss
loss

tensor(6.7012, grad_fn=<NegBackward0>)

In [45]:
## forward pass --> the computation graph looks like:  w*x + b --> z --> sigmoid(z) --> y_pred; (y_pred, y) --> loss_function --> loss

## the backward pass walks this graph in reverse, calculating the derivatives from dLoss/dy_pred back to dLoss/dw and dLoss/db

In [43]:
loss.backward() ## this will calculate all the derivates --chain-rule....w.r.t weight and bias both


In [44]:
print(w.grad)
print(b.grad)

tensor(6.6918)
tensor(0.9988)


### This is not limited to a single value (scalar); most real work is done with vectors
-- now let's take another demo example, this time with a vector

In [46]:
import torch

x  = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)


In [47]:
y = (x**2).mean()
y

tensor(4.6667, grad_fn=<MeanBackward0>)

In [49]:
y.backward()  ## this will calculate derivatives  ...partial differentiation...cxs multiple variables


In [50]:
x.grad  ## these are derivatives w.r.t x

tensor([0.6667, 1.3333, 2.0000])

## Clearing Gradients

-- Clearing gradients in PyTorch means resetting the gradients of your model's parameters to zero before computing the next backward pass, because backward() adds to the existing .grad instead of overwriting it.

In [56]:
## for example:

x = torch.tensor(2.0, requires_grad=True)  ## input
x

tensor(2., requires_grad=True)

In [57]:
y = x ** 2
y

tensor(4., grad_fn=<PowBackward0>)

In [58]:
y.backward() ## derivative

In [59]:
x.grad

tensor(4.)

In [60]:
## till now it's normal, but now if I again do the forward pass

y = x ** 2

In [61]:
y.backward()  ## again derivative

In [62]:
x.grad   ## the derivative is now 8.0 ... so the new value is added to the previous value...that is the problem

tensor(8.)

In [63]:
### running forward, backward again and again.. does not clear already calculated gradients

In [64]:
## in a neural network this is a massive problem, because we run many
## forward and backward passes; accumulating gradients like this means .grad is no longer the gradient of the current pass

In [65]:
## the solution: when doing multiple passes, clear the previous gradient before the next forward/backward pass

In [66]:
## so the change is ....
x.grad.zero_()

tensor(0.)

In [67]:
## if again I run the forward pass see the results...
y = x ** 2

In [68]:
y.backward()   ## calculating derivative this time ...no previous pass values will be aded

In [69]:
x.grad  ## this is new fresh gradient...

tensor(4.)

### How to disable gradient tracking...(that computation graph)

In [70]:
## option 1 --- requires_grad_(False)
## option 2 --- detach()
## option 3 --- torch.no_grad()

In [71]:
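## requires_grad option (set here through the constructor argument; x.requires_grad_(False) does the same in place on an existing tensor)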

x = torch.tensor(3.0, requires_grad=False)
x   ## no requires_grad...

tensor(3.)

In [72]:
y = x ** 2

In [73]:
y    ## no grad function...(nothing in autograd computation graph)

tensor(9.)

In [74]:
## y has no grad_fn, so backward() cannot compute any gradient and raises RuntimeError;
## catch and show the error so the notebook still survives "Restart & Run All"
try:
  y.backward()
except RuntimeError as err:
  print(f"{type(err).__name__}: {err}")

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [75]:
## detach option

x = torch.tensor(2.0, requires_grad=True)
x

tensor(2., requires_grad=True)

In [76]:
z = x.detach()  ## this z will contain same values...but with no gradient tracking...
z

tensor(2.)

In [77]:
## torch no_grad function option

x = torch.tensor(4.0, requires_grad=True)
x

tensor(4., requires_grad=True)

In [78]:
with torch.no_grad():      ## operations inside this context manager are not recorded in the computation graph
  y = x ** 2

In [79]:
y   ## no tracking

tensor(16.)

In [80]:
### y was computed under torch.no_grad(), so it has no grad_fn and backward() raises RuntimeError;
### catch and show the error so the notebook still survives "Restart & Run All"
try:
  y.backward()
except RuntimeError as err:
  print(f"{type(err).__name__}: {err}")

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

### End ..thanks for sticking till here....
