In [1]:
!pip install numpy torch torchvision torchaudio



In [2]:
import torch
import numpy as np

In [3]:
t1 = torch.tensor(4.0)
t1

tensor(4.)

In [4]:
t1.dtype

torch.float32

In [5]:
t1.ndim

0

In [6]:
t1.item()

4.0

In [7]:
int_t1 = t1.type(torch.int32)

In [8]:
int_t1

tensor(4, dtype=torch.int32)

In [9]:
float_t1 = int_t1.type(torch.float32)

In [10]:
float_t1.dtype

torch.float32

In [11]:
t2 = torch.tensor([1,2,3,4,5])

In [12]:
t1.dtype

torch.float32

In [13]:
t2.dtype

torch.int64

In [14]:
t2

tensor([1, 2, 3, 4, 5])

In [15]:
t2_float = t2.type(torch.float32)

In [16]:
t2.dtype

torch.int64

In [17]:
t2_float.dtype

torch.float32

In [18]:
t2

tensor([1, 2, 3, 4, 5])

In [19]:
t2_float

tensor([1., 2., 3., 4., 5.])

In [20]:
#3 dimensional tensors

In [21]:
t3 = torch.tensor([[[1,2,3,4],
                  [4,5,6,7]],
                 [[1,2,3,4],
                 [4,5,6,7]]])

In [22]:
t3


tensor([[[1, 2, 3, 4],
         [4, 5, 6, 7]],

        [[1, 2, 3, 4],
         [4, 5, 6, 7]]])

In [23]:
t3.dtype

torch.int64

In [24]:
t3.ndim

3

In [25]:
t3.item

<function Tensor.item>

In [26]:
t3.shape

torch.Size([2, 2, 4])

In [27]:
##Tensor operations and gradients
#We can combine tensors with the usual arithmetic operations


In [28]:
x = torch.tensor(3.)                      #not interested in future derivative with respect to x, esma chai requires_grad bhanne na rakheko cause we are not interested in the derivate of this value
w = torch.tensor(4., requires_grad = True)      #for backward propagation or for computing the derivatives
b = torch.tensor(5., requires_grad = True)


In [29]:
x, w, b

(tensor(3.), tensor(4., requires_grad=True), tensor(5., requires_grad=True))

In [30]:
#Lets create a tensor y by combining the tensors we just created

In [31]:
y = w * x + b

In [32]:
y

tensor(17., grad_fn=<AddBackward0>)

In [33]:
#computing derivates
y.backward()

The derivatives of y with respect to the input tensors are stored in the .grad property of the respective tensors

In [34]:
#Display gradients
print('dy/dx:', x.grad)
print('dy/dw:', w.grad)
print('dy/db:', b.grad)


dy/dx: None
dy/dw: tensor(3.)
dy/db: tensor(1.)


The "grad" in w.grad is short for gradient, which is another term for derivative. The term gradient is primarily used while dealing with vectors and matrices

In [35]:
#Creating a tensor with a fixed value for every element

t5 = torch.full((3,4), 10)
t5

tensor([[10, 10, 10, 10],
        [10, 10, 10, 10],
        [10, 10, 10, 10]])

In [36]:
#Concatenating two tensors with compatible shapes
t6 = torch.full((3,4), 2)
t7 = torch.cat((t5, t6), dim =0)

In [37]:
t7

tensor([[10, 10, 10, 10],
        [10, 10, 10, 10],
        [10, 10, 10, 10],
        [ 2,  2,  2,  2],
        [ 2,  2,  2,  2],
        [ 2,  2,  2,  2]])

In [38]:
t7_col = torch.cat((t5, t6), dim =1)

In [39]:
t7_col

tensor([[10, 10, 10, 10,  2,  2,  2,  2],
        [10, 10, 10, 10,  2,  2,  2,  2],
        [10, 10, 10, 10,  2,  2,  2,  2]])

In [40]:
#Computing the sin of each element

t8 = torch.sin(t7)
t8

tensor([[-0.5440, -0.5440, -0.5440, -0.5440],
        [-0.5440, -0.5440, -0.5440, -0.5440],
        [-0.5440, -0.5440, -0.5440, -0.5440],
        [ 0.9093,  0.9093,  0.9093,  0.9093],
        [ 0.9093,  0.9093,  0.9093,  0.9093],
        [ 0.9093,  0.9093,  0.9093,  0.9093]])

In [41]:
#Changing the shape of a tensor

t8.shape

torch.Size([6, 4])

In [42]:
t9 = t8.reshape(3,2,4)
t9

tensor([[[-0.5440, -0.5440, -0.5440, -0.5440],
         [-0.5440, -0.5440, -0.5440, -0.5440]],

        [[-0.5440, -0.5440, -0.5440, -0.5440],
         [ 0.9093,  0.9093,  0.9093,  0.9093]],

        [[ 0.9093,  0.9093,  0.9093,  0.9093],
         [ 0.9093,  0.9093,  0.9093,  0.9093]]])

In [43]:
#Interoperability with Numpy

x = np.array([[1,2],
             [3,4]])
x

array([[1, 2],
       [3, 4]])

In [44]:
#Changing into tensor

tx = torch.from_numpy(x)

In [45]:
tx

tensor([[1, 2],
        [3, 4]])

In [46]:
x.dtype, tx.dtype

(dtype('int64'), torch.int64)

In [47]:
#Changing back to numpy

z = tx.numpy()
z

array([[1, 2],
       [3, 4]])

Why use Pytorch when we have preexisting Numpy which already provides data structures and utlities for working with multi-dimensional numeric data?
Two main reasons:
1 - Autograd: The ability to automatically compute gradients for tensor operations is essential for training deep learning models.
2 - GPU support: While working with massive datasets and large models, Pytorch tensor operations can be performed efficintly using GPU that might typically take hours can be completed within minutes using GPU.


# Gradient Descent and Linear Regression with PyTorch


In [48]:
#As an example we would look at a chart of crop yields which contains input variables like temp, rainfall, and humidity in correspond to the target variable of apple and oranges produced

In [49]:
#Input (temp, rainfall, humidity)

inputs = np.array([[73,67,43],
                  [91,88,64],
                  [87,134,58],
                  [102, 43, 37],
                  [69,96,70]], dtype = 'float32')

In [50]:
#Targets(apples, oranges)

targets = np.array([[56,70],
                   [81,101],
                   [119,133],
                   [22,37],
                   [103,119]], dtype = 'float32')

In [51]:
inputs, targets

(array([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.]], dtype=float32),
 array([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]], dtype=float32))

In [52]:
#Converting inputs ans targets into tensors

inputs = torch.from_numpy(inputs)
targets = torch.from_numpy(targets)
print(inputs)
print(targets)

tensor([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.]])
tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


# Linear Regression from the Scratch


In a linear regressoion model, each target variable is estimated to be a weighted sum of the input variables, offset by some constant, known as a bias:

     yield_apple = w11 * temp + w12 * rainfall + w13 * humidity + b1
     yield_orange = w21 * temp + w22 * rainfall + w23 * humidity + b2

The weights and biases(w11, w12, ..., b1 and b2) can also be represented as matrices, initialized as random variables.
The first row of w and the first element of b are used to predict the first target variables, i.e., yield of apples, ans similarly, the second for oranges.


In [53]:
#Weights and biases

w = torch.randn(2,3, requires_grad = True)    #torch.randn creates a tensor with the given shape, with elements picked randomly from a normal distribution with mean 0 and standard deviation 1.
b = torch.randn(2, requires_grad = True)
print(w)
print(b)

tensor([[-0.9381,  0.3945,  2.7540],
        [ 1.4411,  0.8046,  0.5659]], requires_grad=True)
tensor([ 1.9935, -1.0061], requires_grad=True)


In [54]:
inputs

tensor([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.]])

In [55]:
inputs @ w.t() + b

tensor([[ 78.3711, 182.4325],
        [127.6053, 237.1517],
        [132.9815, 265.0025],
        [ 25.1749, 201.5186],
        [167.9227, 215.2800]], grad_fn=<AddBackward0>)

In [56]:
#OR using the function

def model(x):
    return x @ w.t() + b

In [57]:
#generate predictions

preds = model(inputs)
print(preds)

tensor([[ 78.3711, 182.4325],
        [127.6053, 237.1517],
        [132.9815, 265.0025],
        [ 25.1749, 201.5186],
        [167.9227, 215.2800]], grad_fn=<AddBackward0>)


In [58]:
#comparing the predictions of our models with the actual targets

In [59]:
print(targets)

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


In [60]:
#we can see there is a big difference between our model's predictions and the actual targets because we've initialized our model with random weights and biases.

# Loss Function

Before we improve our model, we need a way to evaluate how well our model is performing. We can compare the model's
predictions with the actual targets using the following method:
    1- Calculate the diff between two matrices (preds and targets)
    2- Square all elements of the diff matrix to remove negative values
    3- Calculate the average of the elements in the resulting matrix
The result is a single number, known as the mean squared error(MSE)

In [61]:
#MSE loss

preds - targets

tensor([[ 22.3711, 112.4325],
        [ 46.6053, 136.1517],
        [ 13.9815, 132.0025],
        [  3.1749, 164.5186],
        [ 64.9227,  96.2800]], grad_fn=<SubBackward0>)

In [62]:
diff = preds - targets
diff * diff

tensor([[5.0046e+02, 1.2641e+04],
        [2.1721e+03, 1.8537e+04],
        [1.9548e+02, 1.7425e+04],
        [1.0080e+01, 2.7066e+04],
        [4.2150e+03, 9.2698e+03]], grad_fn=<MulBackward0>)

In [63]:
torch.sum(diff * diff) / diff.numel()   #calculating the average 

tensor(9203.2256, grad_fn=<DivBackward0>)

In [64]:
#Using a function

def mse(t1, t2):
    diff = t1 - t2
    return torch.sum(diff * diff) / diff.numel()

In [65]:
#Computing the loss

loss = mse(preds, targets)
print(loss)

tensor(9203.2256, grad_fn=<DivBackward0>)


Looking at the resuly we can say that, On average, each element in the prediction differs from the actual target by the square root of the loss.

# Computing Gradients

In [66]:
loss.backward()

In [67]:
#Gradients for weights
print(w)
print(w.grad)

tensor([[-0.9381,  0.3945,  2.7540],
        [ 1.4411,  0.8046,  0.5659]], requires_grad=True)
tensor([[ 2378.8135,  2768.5505,  1883.5369],
        [11101.1621, 10703.9688,  6806.2480]])


In [68]:
print(b)
print(b.grad)

tensor([ 1.9935, -1.0061], requires_grad=True)
tensor([ 30.2111, 128.2771])


If the gradient is positive we can figure out from the graph that, we see that:
    - increasing the weight element's value slightly will increase the loss.
    - decreasing the weight element's value slightly will decrease the loss.

If the gradient is negative we can figure out from the graph that, we see that:
    - increasing the weight element's value slightly will decrease the loss.
    - decreasing the weight element's value slightly will increase the loss.

In [69]:
with torch.no_grad():
    w -= w.grad * 1e-5
    b -= b.grad * 1e-5

In [70]:
w,b

(tensor([[-0.9618,  0.3668,  2.7352],
         [ 1.3301,  0.6975,  0.4978]], requires_grad=True),
 tensor([ 1.9932, -1.0074], requires_grad=True))

In [71]:
#verifying that the loss is actually lower with new weights and biases
preds = model(inputs)
loss = mse(preds,targets)
print(loss)

tensor(6461.8682, grad_fn=<DivBackward0>)


In [72]:
w.grad.zero_()     #esle chai grad lai zero banaidincha. yo garena bhane chai next time gradient calculate garda agadikai value as a base liyera calculate gardincha.
b.grad.zero_()     #tei bhayera eslai zero raakheko

tensor([0., 0.])

In [73]:
print(w.grad)
print(b.grad)

tensor([[0., 0., 0.],
        [0., 0., 0.]])
tensor([0., 0.])


In [74]:
#aba we try to decrease the loss function even more

# Train the model using gradient descent
Steps:
1. Generate predictions
2. Calculate the loss
3. Compute gradients w.r.t the weights and biases
4. Adjust the weight by subtracting a small quantity proportional to the gradient
5. Reset the gradients to zero.

In [75]:
#Generate predictions

preds = model(inputs)
print(preds)

tensor([[ 73.9694, 164.2290],
        [121.7985, 213.2729],
        [126.1094, 237.0522],
        [ 20.8608, 183.0731],
        [162.3048, 192.5788]], grad_fn=<AddBackward0>)


In [76]:
#Calculate the loss

loss = mse(preds, targets)
print(loss)

tensor(6461.8682, grad_fn=<DivBackward0>)


In [77]:
#Compute gradients

loss.backward()
print(w.grad)
print(b.grad)

tensor([[1923.7539, 2278.2278, 1581.0623],
        [9224.8984, 8696.2129, 5565.5127]])
tensor([ 24.8086, 106.0412])


In [78]:
#Adjust weights and reset gradients

with torch.no_grad():
    w -= w.grad * 1e-5
    b -= b.grad * 1e-5
    w.grad.zero_()
    b.grad.zero_()

In [79]:
print(w)
print(b)

tensor([[-0.9811,  0.3441,  2.7194],
        [ 1.2378,  0.6106,  0.4422]], requires_grad=True)
tensor([ 1.9930, -1.0084], requires_grad=True)


In [80]:
#Calculate loss

preds = model(inputs)
loss = mse(preds, targets)
print(loss)

tensor(4612.0674, grad_fn=<DivBackward0>)


In [81]:
#We use function to repeat these process for 100 epochs

for i in range(100):
    preds = model(inputs)
    loss = mse(preds, targets)
    loss.backward()
    with torch.no_grad():
        w -= w.grad * 1e-5
        b -= b.grad * 1e-5
        w.grad.zero_()
        b.grad.zero_()

In [82]:
#Calculate loss
preds = model(inputs)
loss = mse(preds, targets)
print(loss)

tensor(358.5272, grad_fn=<DivBackward0>)


In [83]:
for i in range(200):
    preds = model(inputs)
    loss = mse(preds, targets)
    loss.backward()
    with torch.no_grad():
        w -= w.grad * 1e-5
        b -= b.grad * 1e-5
        w.grad.zero_()
        b.grad.zero_()
preds = model(inputs)
loss = mse(preds, targets)
print(loss)

tensor(150.4201, grad_fn=<DivBackward0>)


In [84]:
print(preds)

tensor([[ 55.8738,  72.1023],
        [ 92.9103,  98.3592],
        [ 96.4325, 135.3284],
        [ 12.8076,  48.0732],
        [125.7616, 108.4636]], grad_fn=<AddBackward0>)


In [85]:
print(targets)

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])
