<a href="https://colab.research.google.com/github/rallm/IUST-DL-Fall2025/blob/main/HW3/helper/p2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# --- Dataset Initialization ---
# The four training samples, one per row: (x1, x2, y).
training_samples = [
    (1, -1, 10),
    (2, 0, 13),
    (0, 2, 11),
    (-1, 1, 4),
]

# Split the table column-wise into one array per feature / target.
x1_data, x2_data, y_data = (
    np.array(column) for column in zip(*training_samples)
)

# Hyperparameters
learning_rate = 0.1  # step size applied in every weight update
beta = 0.9           # decay factor of the momentum velocity
batch_size = 2       # number of samples per mini-batch

# --- Helper Functions ---

def predict(w1, w2, w3, b, x1, x2):
    """
    Evaluates the quadratic model

        y = w1*x1^2 + w2*x2^2 + w3*x1*x2 + b

    x1 and x2 may be scalars or NumPy arrays; with arrays the arithmetic is
    applied element-wise, giving one prediction per sample.
    """
    x1_squared_term = w1 * (x1**2)
    x2_squared_term = w2 * (x2**2)
    interaction_term = w3 * (x1 * x2)
    return x1_squared_term + x2_squared_term + interaction_term + b

def compute_gradients(w1, w2, w3, b, x1_batch, x2_batch, y_batch):
    """
    Computes the MSE-loss gradients for one batch, plus the batch MSE.

    Args:
        w1, w2, w3, b: Current model parameters.
        x1_batch, x2_batch: Input features of the batch (NumPy arrays).
        y_batch: Target values of the batch.

    Returns:
        Tuple (grad_w1, grad_w2, grad_w3, grad_b, mse), where mse is the
        mean squared error of the batch under the current parameters.
    """
    batch_len = len(y_batch)
    residuals = predict(w1, w2, w3, b, x1_batch, x2_batch) - y_batch

    # d(MSE)/d(param) = (2/n) * sum(residual * d(prediction)/d(param))
    scale = 2 / batch_len

    # Partial derivatives of the prediction w.r.t. w1, w2, w3 and b, in order.
    partials = (x1_batch**2, x2_batch**2, x1_batch * x2_batch, 1)
    grad_w1, grad_w2, grad_w3, grad_b = (
        scale * np.sum(residuals * partial) for partial in partials
    )

    return grad_w1, grad_w2, grad_w3, grad_b, np.mean(residuals**2)

In [2]:
print("--- Starting SGD Optimization ---")

# Every optimizer run starts from the same initial parameters.
w1, w2, w3, b = 1.0, -1.0, -1.0, 1.0

# Only full batches are processed; samples left over after the integer
# division are not used.
num_batches = len(y_data) // batch_size

# One pass over the data, stepping through the start offset of each batch.
for i, start_idx in enumerate(range(0, num_batches * batch_size, batch_size)):
    end_idx = start_idx + batch_size
    batch = slice(start_idx, end_idx)
    x1_batch, x2_batch, y_batch = x1_data[batch], x2_data[batch], y_data[batch]

    # Gradients (and MSE) of the current batch at the current parameters.
    gw1, gw2, gw3, gb, loss = compute_gradients(
        w1, w2, w3, b, x1_batch, x2_batch, y_batch
    )

    print(f"\nBatch {i+1}:")
    print(f"  Samples used (indices): {start_idx} to {end_idx-1}")
    print(f"  Gradients -> w1: {gw1:.4f}, w2: {gw2:.4f}, w3: {gw3:.4f}, b: {gb:.4f}")

    # Plain SGD step: move each parameter against its gradient.
    w1 -= learning_rate * gw1
    w2 -= learning_rate * gw2
    w3 -= learning_rate * gw3
    b -= learning_rate * gb

    print(f"  New Weights -> w1: {w1:.4f}, w2: {w2:.4f}, w3: {w3:.4f}, b: {b:.4f}")

print("\n--- Final Weights (SGD) ---")
print(f"w1: {w1:.4f}, w2: {w2:.4f}, w3: {w3:.4f}, b: {b:.4f}")

--- Starting SGD Optimization ---

Batch 1:
  Samples used (indices): 0 to 1
  Gradients -> w1: -40.0000, w2: -8.0000, w3: 8.0000, b: -16.0000
  New Weights -> w1: 5.0000, w2: -0.2000, w3: -1.8000, b: 2.6000

Batch 2:
  Samples used (indices): 2 to 3
  Gradients -> w1: 5.2000, w2: -31.6000, w3: -5.2000, b: -4.0000
  New Weights -> w1: 4.4800, w2: 2.9600, w3: -1.2800, b: 3.0000

--- Final Weights (SGD) ---
w1: 4.4800, w2: 2.9600, w3: -1.2800, b: 3.0000


In [3]:
print("--- Starting SGD + Momentum Optimization ---")

# Reset weights to initial values so this run starts from the same point
# as the plain SGD run.
w1, w2, w3, b = 1.0, -1.0, -1.0, 1.0

# Initialize velocities (v) to 0
v_w1, v_w2, v_w3, v_b = 0.0, 0.0, 0.0, 0.0

# Number of full batches. Computed here so this cell does not rely on the
# SGD cell having been executed first.
num_batches = len(y_data) // batch_size

for i in range(num_batches):
    # Slice the data for the current batch
    start_idx = i * batch_size
    end_idx = start_idx + batch_size

    x1_batch = x1_data[start_idx:end_idx]
    x2_batch = x2_data[start_idx:end_idx]
    y_batch = y_data[start_idx:end_idx]

    # Gradients (and MSE) of the current batch at the current parameters.
    gw1, gw2, gw3, gb, loss = compute_gradients(w1, w2, w3, b, x1_batch, x2_batch, y_batch)

    print(f"\nBatch {i+1}:")
    print(f"  Gradients -> w1: {gw1:.4f}, w2: {gw2:.4f}, w3: {gw3:.4f}, b: {gb:.4f}")

    # Update Velocities (EMA Formula: v = beta * v + (1-beta) * grad).
    # Velocities start at 0 and no bias correction is applied, so the first
    # velocity is just (1 - beta) times the first gradient.
    v_w1 = beta * v_w1 + (1 - beta) * gw1
    v_w2 = beta * v_w2 + (1 - beta) * gw2
    v_w3 = beta * v_w3 + (1 - beta) * gw3
    v_b  = beta * v_b  + (1 - beta) * gb

    print(f"  Velocities -> v_w1: {v_w1:.4f}, v_w2: {v_w2:.4f}, v_w3: {v_w3:.4f}, v_b: {v_b:.4f}")

    # Update Weights (Momentum Rule: w = w - lr * v)
    w1 = w1 - learning_rate * v_w1
    w2 = w2 - learning_rate * v_w2
    w3 = w3 - learning_rate * v_w3
    b  = b  - learning_rate * v_b

    print(f"  New Weights -> w1: {w1:.4f}, w2: {w2:.4f}, w3: {w3:.4f}, b: {b:.4f}")

print("\n--- Final Weights (SGD + Momentum) ---")
print(f"w1: {w1:.4f}, w2: {w2:.4f}, w3: {w3:.4f}, b: {b:.4f}")

--- Starting SGD + Momentum Optimization ---

Batch 1:
  Gradients -> w1: -40.0000, w2: -8.0000, w3: 8.0000, b: -16.0000
  Velocities -> v_w1: -4.0000, v_w2: -0.8000, v_w3: 0.8000, v_b: -1.6000
  New Weights -> w1: 1.4000, w2: -0.9200, w3: -1.0800, b: 1.1600

Batch 2:
  Gradients -> w1: -1.2800, w2: -55.3600, w3: 1.2800, b: -14.8000
  Velocities -> v_w1: -3.7280, v_w2: -6.2560, v_w3: 0.8480, v_b: -2.9200
  New Weights -> w1: 1.7728, w2: -0.2944, w3: -1.1648, b: 1.4520

--- Final Weights (SGD + Momentum) ---
w1: 1.7728, w2: -0.2944, w3: -1.1648, b: 1.4520
