<a href="https://colab.research.google.com/github/rahul0772/python-ml-ai-relearning/blob/main/AI%20and%20ML%20with%20PyTorch/day13_Pytorch_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PyTorch Basics

In [None]:
# ============================================================
# PYTORCH FROM SCRATCH
# ============================================================


# ============================================================
# 1️⃣ WHAT IS PYTORCH?
# ============================================================

# PyTorch is a Python library used for:
# - Machine Learning
# - Deep Learning
# - Neural Networks
#
# PyTorch helps us:
# - Work with numbers (tensors)
# - Automatically calculate gradients (backpropagation)
# - Build and train neural networks easily
#
# BIG IDEA:
# PyTorch = NumPy + Automatic Differentiation + GPU support


# ============================================================
# 2️⃣ INSTALL & IMPORT PYTORCH
# ============================================================

# In Google Colab, PyTorch is already installed
# So we just import it

import torch

# torch is the main PyTorch library
# Everything we do will use this


# ============================================================
# 3️⃣ WHAT IS A TENSOR? (VERY IMPORTANT)
# ============================================================

# A tensor is PyTorch's basic data container. Think of it as the PyTorch
# version of a NumPy array. Depending on its number of dimensions it is:
# - a single number           (0D tensor)
# - a list                    (1D tensor)
# - a table                   (2D tensor)
# - higher dimensional data   (3D, 4D, ...)

# torch.tensor() takes Python data (a number, a list, a list of lists) and
# converts it into a tensor, the data type PyTorch uses for all computations.

# 0-dimensional tensor: a tensor object that holds the single value 5
scalar_value = 5
a = torch.tensor(scalar_value)

# Show the tensor itself, then the Python type of the object
print("Tensor a:", a)
print("Type of a:", type(a))


# ============================================================
# 4️⃣ 1D TENSOR (VECTOR)
# ============================================================

# A 1D tensor is a list of numbers (a vector).
# Here it contains the numbers 1 through 5, i.e. a vector of length 5.
vector_values = [1, 2, 3, 4, 5]
b = torch.tensor(vector_values)

print("\nTensor b:", b)
# .shape tells how many dimensions there are and how many elements in each
print("Shape of b:", b.shape)


# ============================================================
# 5️⃣ 2D TENSOR (MATRIX)
# ============================================================

# A 2D tensor is like a table with rows and columns.
# Each inner list is one row, so this is a 2×3 matrix.
matrix_rows = [
    [1, 2, 3],
    [4, 5, 6],
]
c = torch.tensor(matrix_rows)

print("\nTensor c:\n", c)
print("Shape of c:", c.shape)

# Shape (2, 3) means:
# 2 rows
# 3 columns


# ============================================================
# 6️⃣ CREATING TENSORS USING PYTORCH FUNCTIONS
# ============================================================

# Instead of typing the values by hand, PyTorch can build a tensor of a
# requested size for us. The size is given as (rows, columns).

# Zeros tensor: 3 rows, 3 columns, filled entirely with zeros
zeros = torch.zeros((3, 3))
print("\nZeros tensor:\n", zeros)

# Expected output:
#  tensor([[0., 0., 0.],
#          [0., 0., 0.],
#          [0., 0., 0.]])

# Ones tensor: 2 rows, 2 columns, every element is 1
ones = torch.ones((2, 2))
print("\nOnes tensor:\n", ones)

# Expected output:
#  tensor([[1., 1.],
#          [1., 1.]])

# Random tensor: 2 rows, 3 columns, each element is a random number
# in the range [0, 1)
random = torch.rand((2, 3))
print("\nRandom tensor:\n", random)

# Example output (the values differ on every run):
#  tensor([[0.5311, 0.0732, 0.1155],
#          [0.7622, 0.2232, 0.1324]])


# ============================================================
# 7️⃣ BASIC TENSOR OPERATIONS
# ============================================================

# x and y are 1D tensors (vectors) of the same length.
# The arithmetic operators below work element-wise: the first element of x
# is combined with the first element of y, the second with the second, etc.
x = torch.tensor([1, 2, 3])
y = torch.tensor([4, 5, 6])

# Addition: 1 + 4 = 5, 2 + 5 = 7, 3 + 6 = 9
elementwise_sum = x + y
print("\nAddition:", elementwise_sum)

# Subtraction: each element of y is subtracted from the matching element
# of x, e.g. 1 - 4 = -3
elementwise_difference = x - y
print("Subtraction:", elementwise_difference)

# Multiplication (element-wise, NOT a dot product): 1 * 4 = 4
elementwise_product = x * y
print("Multiplication:", elementwise_product)

# Division: each element of x is divided by the matching element of y,
# e.g. 1 / 4 = 0.25
elementwise_quotient = x / y
print("Division:", elementwise_quotient)


# ============================================================
# 8️⃣ AUTOGRAD (MAGIC OF PYTORCH)
# ============================================================

# Why gradients matter:
# When a computer is learning (machine learning, training a neural network)
# it starts with random guesses and has to adjust them to get closer to the
# correct answer. Gradients tell it HOW, and BY HOW MUCH, to change each
# guess in order to improve.
#
# Autograd calculates those gradients automatically. It is like an automatic
# math teacher that works out how a function changes.
# Gradients are the "directions" the model follows during training:
# “How do I change to get closer to the correct answer?”
# “Go in that direction! This way will make your answer better!”

# requires_grad=True tells PyTorch:
# "Track this tensor for gradient calculation"

# x holds a single floating point number, 2.0.
# requires_grad=True means: track this number, because later we want to know
# how it affects other numbers computed from it.
# In ML we adjust our numbers (like x) to make predictions better over time;
# to do that we must know how changing x changes the result.
x = torch.tensor(2.0, requires_grad=True)

# A simple function of x. With x = 2.0 this gives y = 3 * 2.0² = 12.0
y = 3 * x ** 2   # y = 3x^2

# Backpropagation
# Backpropagation is the technique used to calculate gradients.
# A gradient is the direction and rate of change of a function: it shows how
# much a parameter should be adjusted to reduce the error.
# y.backward() walks back through the math that produced y and works out how
# fast y changes when x changes a little bit.
# For y = 3x² at x = 2.0 that rate is 12.0, so a small change in x changes y
# by about 12 times that amount. The result is stored in x.grad.
y.backward()

# Another example (not run here):
# x = torch.tensor(1.0, requires_grad=True)   # tensor tracked by PyTorch
# y = 4 * x**3 + 2 * x                        # our function
# y.backward()                                # compute dy/dx
# x.grad                                      # dy/dx at x = 1.0, which is 14.0

# Gradient of y with respect to x
print("\nValue of x:", x)
print("y = 3x^2")
print("Gradient dy/dx:", x.grad)

# Math check:
# y = 3x^2
# dy/dx = 6x
# at x=2 → 6*2 = 12 ✔️


# ============================================================
# 9️⃣ SIMPLE LINEAR MODEL (y = wx + b)
# ============================================================

# This is the MOST BASIC neural network: a linear model, i.e. one simple
# equation that makes a prediction from input data.
#   y = w * x + b
# y : the output / prediction we want to make
# x : the input data (feature)
# w : the weight (coefficient) that multiplies the input. It controls how
#     strongly x affects y:
#       - large w: even a small change in x has a big impact on y
#       - small w: x does not influence y as much
# b : the bias / intercept / starting point (like a base price for any house,
#     no matter its size). It is an offset that makes the model more
#     flexible: without b the line would always pass through the origin
#     (0, 0), which might not match the data well.

# Parameters
# requires_grad=True makes PyTorch track w and b, so that gradients can be
# computed for them and used to update them later.
w = torch.tensor(1.0, requires_grad=True)
b = torch.tensor(0.0, requires_grad=True)

# Input: the number we feed to the model
x = torch.tensor(2.0)

# Forward pass (prediction)
# A forward pass only calculates the output from the current weight and bias;
# nothing is updated. The goal of training is to find the w and b that make
# this prediction as close as possible to the true target.
weighted_input = w * x
y_pred = weighted_input + b   # predicted value: 1.0 * 2.0 + 0.0 = 2.0

print("\nPrediction y:", y_pred)


# ============================================================
# 🔟 LOSS FUNCTION
# ============================================================

# The loss tells us how wrong the model's prediction is compared to the true
# value. It measures how far we missed the mark, so we can improve over time.
# Training tries to make the loss as small as possible.

# True value: the correct answer we are trying to predict
y_true = torch.tensor(4.0)

# Squared error (the single-example form of Mean Squared Error, MSE)
#   Loss = (y_pred - y_true)^2
# 1. Take the difference between the prediction and the true value.
# 2. Square it. Squaring removes the sign (otherwise negative and positive
#    errors would cancel out) and makes large errors count much more than
#    small ones.
error = y_pred - y_true
loss = error ** 2

# With the values above:
# (2.0 - 4.0) ** 2 = (-2.0) ** 2 = 4.0
# The prediction is 2 units away from the true value (4.0), so the loss is 4.0.
# If the model had predicted exactly 4.0, the loss would be 0: no error.


# ============================================================
# 1️⃣1️⃣ BACKPROPAGATION (How wrong was I?) “What should I change so I miss LESS next time?”
# ============================================================

# Backpropagation is the key algorithm for training neural networks. It lets
# the network learn from its error (the loss) by working out how each
# parameter should change to reduce that error.
#
# The flow:
# 1. Forward pass: predict with the current parameters.
# 2. Loss calculation: compare the prediction with the true value.
# 3. Backpropagation: compute the gradient of the loss with respect to each
#    parameter, i.e. how much the loss would change if that parameter were
#    adjusted slightly.
# 4. Parameter update: use the gradients to change the parameters so the loss
#    goes down (gradient descent or a similar optimizer such as SGD).
#
# PyTorch does step 3 by automatic differentiation: it recorded every
# operation (subtraction, multiplication, ...) and applies the chain rule of
# calculus to them. After loss.backward() the gradient of each tracked
# parameter is stored in its .grad attribute.
loss.backward()

# w.grad → how much w is to blame for the mistake
# b.grad → how much b is to blame for the mistake
print("\nGradient of w:", w.grad)
print("Gradient of b:", b.grad)


# ============================================================
# 1️⃣2️⃣ MANUAL GRADIENT DESCENT (Fix it slowly)
# ============================================================

# Gradient descent minimizes the loss by repeatedly nudging the parameters
# (w and b) in the direction OPPOSITE to their gradient.

# Learning rate: a small number that controls how big each nudge is.
# A small value changes the parameters gradually; a large value takes bigger
# steps and can overshoot the best solution.
# lr = 0.01 means each step is 1% of the calculated gradient.
lr = 0.01

# Update parameters (IMPORTANT)
# lr * grad is the size of the step; subtracting it moves the parameter
# against the gradient, which is the direction that reduces the loss.
# torch.no_grad() tells PyTorch not to record the update in the computation
# graph: the fix itself must not become part of the next gradient
# calculation. ("I'm just fixing stuff, don't track this.")
with torch.no_grad():
    for param in (w, b):
        param.sub_(lr * param.grad)

# Reset gradients (VERY IMPORTANT)
# PyTorch ACCUMULATES gradients: every .backward() call adds to .grad.
# If we did not zero them after the update, gradients from different
# iterations would be mixed together and later updates would be far too big.
# “Forget the past. Fresh start.” Old mistakes + new mistakes = ❌ chaos
for param in (w, b):
    param.grad.zero_()

print("\nUpdated w:", w)
print("Updated b:", b)


# ============================================================
# 1️⃣3️⃣ TRAINING LOOP (REAL LEARNING)
# ============================================================

# We want a machine to figure out a rule by itself just by seeing examples.

# DATASET
# -----------------------------
# X = inputs we show the model
# Y = correct answers we WANT the model to predict
# Hidden rule (the model does NOT know this): Y = 2 * X
X = torch.tensor([1.0, 2.0, 3.0, 4.0])
Y = torch.tensor([2.0, 4.0, 6.0, 8.0])

# MODEL BRAIN (STARTS DUMB)
# -----------------------------
# The model assumes the world works like: y = w*x + b
# w = how strongly x affects y
# b = starting offset that shifts the output up or down
# These two tensors are the only things that can change during training.
# requires_grad=True tells PyTorch: “Track every operation involving this
# value so we can later figure out how it affected the loss.”
# PyTorch builds a computation graph in the background to do that.
# At this point w = 0 and b = 0: the model believes the output is always zero.
w = torch.tensor(0.0, requires_grad=True)
b = torch.tensor(0.0, requires_grad=True)

# TRAINING
# -----------------------------
# An epoch = one learning attempt over the data.
# Training repeats for several epochs; each epoch runs the same 5 steps:
# forward pass, loss, backprop, parameter update, gradient reset.
num_epochs = 20
lr = 0.01   # learning rate: small baby steps

for epoch in range(num_epochs):

    # ▶️ FORWARD PASS (GUESS)
    # -------------------------
    # The model predicts with its current parameters.
    # At first: w=0, b=0 → prediction = 0 (VERY WRONG)
    y_pred = w * X + b

    # 📉 LOSS (HOW WRONG?)
    # -------------------------
    # y_pred - Y  → the error for each example
    # ** 2        → removes the sign and punishes big mistakes more
    # .mean()     → combines all errors into ONE scalar number
    loss = ((y_pred - Y) ** 2).mean()
    # This scalar loss:
    # - summarizes how wrong the model is over all examples
    # - is a function of w and b
    # - is connected to the computation graph
    # This is crucial: the loss is not just a number — it is a node in a
    # graph linking back to w and b, which is what makes backward() possible.

    # 🔄 BACKPROP (WHO IS WRONG?)
    # -------------------------
    # PyTorch goes backward through the graph and figures out:
    # "How much did w mess up?"  → stored in w.grad
    # "How much did b mess up?"  → stored in b.grad
    loss.backward()

    # 🔧 FIX THE BRAIN
    # -------------------------
    # torch.no_grad() = "I am fixing things manually, don't track this"
    # -=               = move in the opposite direction of the mistake
    with torch.no_grad():
        w -= lr * w.grad
        b -= lr * b.grad

    # 🧹 CLEAN UP (IMPORTANT)
    # -------------------------
    # Gradients ADD UP by default.
    # If we don't reset them, the next epoch mixes old and new gradients.
    w.grad.zero_()
    b.grad.zero_()

    # 🖨 SHOW PROGRESS
    # -------------------------
    # loss.item() turns the one-element tensor into a normal Python number.
    # Note: this is the loss measured BEFORE this epoch's update.
    print(f"Epoch {epoch+1}: Loss={loss.item():.4f}")

# FINAL RESULT
# -----------------------------
# With enough training the model should approach the hidden rule:
# w ≈ 2 , b ≈ 0
# NOTE: 20 epochs at this small learning rate may only get part of the way
# there; train for more epochs to get closer.
print("\nFinal w:", w.item())
print("Final b:", b.item())


# ============================================================
# 🎉 CONGRATULATIONS
# ============================================================

# You just learned:
# - What PyTorch is
# - What tensors are
# - Autograd (backpropagation)
# - Loss functions
# - Gradient descent
# - Training a model from scratch
#
# NEXT STEPS (when you click "next"):
# - torch.nn
# - torch.optim
# - Real neural networks
# - CNNs, RNNs, Transformers
#
# SAVE THIS CELL AS YOUR NOTES ❤️
# ============================================================


Tensor a: tensor(5)
Type of a: <class 'torch.Tensor'>

Tensor b: tensor([1, 2, 3, 4, 5])
Shape of b: torch.Size([5])

Tensor c:
 tensor([[1, 2, 3],
        [4, 5, 6]])
Shape of c: torch.Size([2, 3])

Zeros tensor:
 tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])

Ones tensor:
 tensor([[1., 1.],
        [1., 1.]])

Random tensor:
 tensor([[0.8948, 0.4137, 0.5292],
        [0.0606, 0.0119, 0.6130]])

Addition: tensor([5, 7, 9])
Subtraction: tensor([-3, -3, -3])
Multiplication: tensor([ 4, 10, 18])
Division: tensor([0.2500, 0.4000, 0.5000])

Value of x: tensor(2., requires_grad=True)
y = 3x^2
Gradient dy/dx: tensor(12.)

Prediction y: tensor(2., grad_fn=<AddBackward0>)

Gradient of w: tensor(-8.)
Gradient of b: tensor(-4.)

Updated w: tensor(1.0800, requires_grad=True)
Updated b: tensor(0.0400, requires_grad=True)
Epoch 1: Loss=30.0000
Epoch 2: Loss=20.8350
Epoch 3: Loss=14.4755
Epoch 4: Loss=10.0626
Epoch 5: Loss=7.0006
Epoch 6: Loss=4.8757
Epoch 7: Loss=3.4013
Epoch 8