<a href="https://colab.research.google.com/github/povashraful/sota_implementation/blob/main/Fundamental%20Deep%20Learning%20%26%20Optimization%20Algorithms/Simple_Gradient_Descent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Workflow

Simple (vanilla) Gradient Descent:

Compute the forward pass (Y_hat = W*X + B).

Calculate the loss (Mean Squared Error scaled by 1/2: L = 1/(2n) * sum((Y_hat - Y)^2)).

Compute the gradients manually (dW = (1/n) * sum(Error * X), dB = (1/n) * sum(Error), where Error = Y_hat - Y).

Update the parameters using the gradient descent rule (W_new = W - lr * dW, B_new = B - lr * dB).

Repeat for multiple iterations.

# Method 1 (without early stopping)

In [5]:
import numpy as np

# Training set: six (x, y) pairs that lie exactly on the line y = 3x + 2.
X = np.array([1, 2, 3, 4, 5, 6], dtype=int)      # inputs  (edit to try other data)
Y = np.array([5, 8, 11, 14, 17, 20], dtype=int)  # targets (edit to try other data)

# Starting point and hyper-parameters for plain gradient descent.
W, B = 0.0, 0.0
learning_rate = 0.01
n_iterations = 50
n = len(X)

# Every iteration prints a full worked trace: per-sample table, averaged
# gradients, the parameter update, and the loss of the pre-update parameters.
for i in range(n_iterations):
    print(f"\n================ Iteration {i+1} ================")

    # ---- Step 1: forward pass, residuals and per-sample gradient terms ----
    print("Step 1: Forward Pass and Error Computation")
    print(" X | Y  | Y_hat = W*X + B | Error = Y_hat - Y | dW_contrib = Error*X | dB_contrib = Error")
    print("--------------------------------------------------------------------------------------------")

    Y_hat_list = [W * xi + B for xi in X]
    Error_list = [pred - yi for pred, yi in zip(Y_hat_list, Y)]
    dW_contrib_list = [err * xi for err, xi in zip(Error_list, X)]
    dB_contrib_list = list(Error_list)  # d(loss)/dB term is the residual itself

    # One table row per sample, each cell written out as an equation.
    for xi, yi, pred, err, dw_term in zip(X, Y, Y_hat_list, Error_list, dW_contrib_list):
        forward_part = f"{xi:>2} | {yi:>2} | {pred:>6.2f} = {W:.2f}*{xi} + {B:.2f} | "
        error_part = f"{err:>6.2f} = {pred:.2f} - {yi} | "
        grad_part = f"{dw_term:>10.2f} = {err:.2f}*{xi} | {err:>9.2f} = {err:.2f}"
        print(forward_part + error_part + grad_part)

    # ---- Step 2: average the per-sample terms into the gradients ----
    sum_dW = np.sum(dW_contrib_list)
    sum_dB = np.sum(dB_contrib_list)
    dW = (1/n) * sum_dW
    dB = (1/n) * sum_dB

    print("\nStep 2: Compute Average Gradients")
    print(f"dW = (1/n) * sum(dW_contrib) = (1/{n}) * {sum_dW:.2f} = {dW:.4f}")
    print(f"dB = (1/n) * sum(dB_contrib) = (1/{n}) * {sum_dB:.2f} = {dB:.4f}")

    # ---- Step 3: take one gradient-descent step ----
    W_new = W - learning_rate * dW
    B_new = B - learning_rate * dB
    print("\nStep 3: Update Parameters")
    print(f"W_new = W - lr*dW = {W:.4f} - {learning_rate}*{dW:.4f} = {W_new:.4f}")
    print(f"B_new = B - lr*dB = {B:.4f} - {learning_rate}*{dB:.4f} = {B_new:.4f}")

    # ---- Step 4: loss of the parameters used in this iteration's forward pass ----
    Error_array = np.array(Error_list)
    sq_err_total = np.sum(Error_array**2)
    loss = (1/(2*n)) * sq_err_total
    print("\nStep 4: Compute Loss")
    print(f"L = 1/(2n) * sum(Error^2) = 1/(2*{n}) * {sq_err_total:.2f} = {loss:.4f}")

    # Carry the updated parameters into the next iteration.
    W, B = W_new, B_new



Step 1: Forward Pass and Error Computation
 X | Y  | Y_hat = W*X + B | Error = Y_hat - Y | dW_contrib = Error*X | dB_contrib = Error
--------------------------------------------------------------------------------------------
 1 |  5 |   2.00 = 1.00*1 + 1.00 |  -3.00 = 2.00 - 5 |      -3.00 = -3.00*1 |     -3.00 = -3.00
 2 |  8 |   3.00 = 1.00*2 + 1.00 |  -5.00 = 3.00 - 8 |     -10.00 = -5.00*2 |     -5.00 = -5.00
 3 | 11 |   4.00 = 1.00*3 + 1.00 |  -7.00 = 4.00 - 11 |     -21.00 = -7.00*3 |     -7.00 = -7.00
 4 | 14 |   5.00 = 1.00*4 + 1.00 |  -9.00 = 5.00 - 14 |     -36.00 = -9.00*4 |     -9.00 = -9.00
 5 | 17 |   6.00 = 1.00*5 + 1.00 | -11.00 = 6.00 - 17 |     -55.00 = -11.00*5 |    -11.00 = -11.00
 6 | 20 |   7.00 = 1.00*6 + 1.00 | -13.00 = 7.00 - 20 |     -78.00 = -13.00*6 |    -13.00 = -13.00

Step 2: Compute Average Gradients
dW = (1/n) * sum(dW_contrib) = (1/6) * -203.00 = -33.8333
dB = (1/n) * sum(dB_contrib) = (1/6) * -48.00 = -8.0000

Step 3: Update Parameters
W_new = W - l

# Method 2 (with early stopping)

In [11]:
import numpy as np

# Step 1: Initialize data points
X = np.array([1, 2, 3, 4, 5, 6], dtype=int)    # inputs - We can just change the values
Y = np.array([5, 8, 11, 14, 17, 20], dtype=int) # true labels - We can just change the values

# Step 2: Initialize parameters
W = 0.0               # floats, matching Method 1 (formatted output is unchanged)
B = 0.0
learning_rate = 0.01
n_iterations = 10000  # Upper bound only; early stopping is expected to break out first
n = len(X)
tolerance = 1e-6      # Stop once the loss changes by less than this between iterations
prev_loss = float('inf')


def _emit(text, log):
    """Print ``text`` to stdout and append it, newline-terminated, to ``log``."""
    print(text)
    log.write(text + "\n")


# The `with` block guarantees the log is closed even if the loop raises, and the
# explicit UTF-8 encoding keeps the "—" in the early-stopping message writable on
# platforms whose default text encoding is not UTF-8.
with open("gradient_descent_log.txt", "w", encoding="utf-8") as log_file:
    # Gradient Descent Loop
    for i in range(n_iterations):
        _emit(f"\n================ Iteration {i+1} ================", log_file)

        # Step 1: Forward pass and error
        _emit("Step 1: Forward Pass and Error Computation", log_file)
        _emit(" X | Y  | Y_hat = W*X + B | Error = Y_hat - Y | dW_contrib = Error*X | dB_contrib = Error", log_file)
        _emit("--------------------------------------------------------------------------------------------", log_file)

        Error_list = []
        dW_contrib_list = []
        dB_contrib_list = []

        for xi, yi in zip(X, Y):
            y_hat = W * xi + B
            error = y_hat - yi
            dw_contrib = error * xi
            db_contrib = error

            Error_list.append(error)
            dW_contrib_list.append(dw_contrib)
            dB_contrib_list.append(db_contrib)

            # Log each calculation as an equation
            _emit(f"{xi:>2} | {yi:>2} | {y_hat:>6.2f} = {W:.2f}*{xi} + {B:.2f} | "
                  f"{error:>6.2f} = {y_hat:.2f} - {yi} | {dw_contrib:>10.2f} = {error:.2f}*{xi} | {db_contrib:>9.2f} = {error:.2f}",
                  log_file)

        # Step 2: Compute average gradients
        dW_sum = np.sum(dW_contrib_list)
        dB_sum = np.sum(dB_contrib_list)
        dW = (1/n) * dW_sum
        dB = (1/n) * dB_sum

        _emit("\nStep 2: Compute Average Gradients", log_file)
        _emit(f"dW = (1/n) * sum(dW_contrib) = (1/{n}) * {dW_sum:.2f} = {dW:.4f}", log_file)
        _emit(f"dB = (1/n) * sum(dB_contrib) = (1/{n}) * {dB_sum:.2f} = {dB:.4f}", log_file)

        # Step 3: Update parameters
        W_new = W - learning_rate * dW
        B_new = B - learning_rate * dB
        _emit("\nStep 3: Update Parameters", log_file)
        _emit(f"W_new = W - lr*dW = {W:.4f} - {learning_rate}*{dW:.4f} = {W_new:.4f}", log_file)
        _emit(f"B_new = B - lr*dB = {B:.4f} - {learning_rate}*{dB:.4f} = {B_new:.4f}", log_file)

        # Step 4: Compute loss (for the parameters used in this iteration's forward pass)
        Error_array = np.array(Error_list)
        sq_err_sum = np.sum(Error_array**2)
        loss = (1/(2*n)) * sq_err_sum
        _emit("\nStep 4: Compute Loss", log_file)
        _emit(f"L = 1/(2n) * sum(Error^2) = 1/(2*{n}) * {sq_err_sum:.2f} = {loss:.6f}", log_file)

        # Step 5: Early stopping. On the first pass prev_loss is inf, so the
        # difference is inf and the check cannot fire.
        if abs(prev_loss - loss) < tolerance:
            _emit(f"\nEARLY STOPPING at iteration {i+1} — loss improvement < {tolerance}", log_file)
            # W and B are deliberately left un-updated here so that they match
            # the loss reported in the summary line below.
            break

        prev_loss = loss

        # Update parameters for next iteration
        W = W_new
        B = B_new

    _emit(f"\nOptimal Parameters Found: W = {W:.4f}, B = {B:.4f}, Loss = {loss:.6f}", log_file)



Step 1: Forward Pass and Error Computation
 X | Y  | Y_hat = W*X + B | Error = Y_hat - Y | dW_contrib = Error*X | dB_contrib = Error
--------------------------------------------------------------------------------------------
 1 |  5 |   0.00 = 0.00*1 + 0.00 |  -5.00 = 0.00 - 5 |      -5.00 = -5.00*1 |     -5.00 = -5.00
 2 |  8 |   0.00 = 0.00*2 + 0.00 |  -8.00 = 0.00 - 8 |     -16.00 = -8.00*2 |     -8.00 = -8.00
 3 | 11 |   0.00 = 0.00*3 + 0.00 | -11.00 = 0.00 - 11 |     -33.00 = -11.00*3 |    -11.00 = -11.00
 4 | 14 |   0.00 = 0.00*4 + 0.00 | -14.00 = 0.00 - 14 |     -56.00 = -14.00*4 |    -14.00 = -14.00
 5 | 17 |   0.00 = 0.00*5 + 0.00 | -17.00 = 0.00 - 17 |     -85.00 = -17.00*5 |    -17.00 = -17.00
 6 | 20 |   0.00 = 0.00*6 + 0.00 | -20.00 = 0.00 - 20 |    -120.00 = -20.00*6 |    -20.00 = -20.00

Step 2: Compute Average Gradients
dW = (1/n) * sum(dW_contrib) = (1/6) * -315.00 = -52.5000
dB = (1/n) * sum(dB_contrib) = (1/6) * -75.00 = -12.5000

Step 3: Update Parameters
W_new = 

In [4]:
# Offer the gradient-descent log as a browser download. `google.colab` can only
# be imported inside Colab, so report and skip instead of raising elsewhere.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download of gradient_descent_log.txt")
else:
    files.download("gradient_descent_log.txt")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>