**Comparing Gradient Descent Variants for Linear Regression**

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Gradient descent (full-batch, mini-batch or stochastic) with optional momentum
def train_gd(X, y, epochs=500, learning_rate=0.01, batch_size=None, momentum=0.0):
    """Fit a linear model ``X @ weights + bias`` by gradient descent on the MSE.

    Every one of the ``epochs`` iterations performs exactly one parameter
    update. When ``batch_size`` is set and smaller than the number of samples,
    that update uses a fresh random subset drawn without replacement;
    otherwise it uses the whole dataset.

    Initial parameters and mini-batch indices come from NumPy's global random
    state, so call ``np.random.seed`` beforehand for reproducible results.

    Args:
        X: Array-like of shape (n_samples, n_features).
        y: Array-like holding one target per row of ``X`` (1-D, or a single
            column/row that flattens to n_samples values).
        epochs: Number of update steps to run.
        learning_rate: Step size applied to the smoothed gradient.
        batch_size: Samples per update; ``None`` or ``0`` means full batch.
        momentum: Weight given to the previous velocity in the exponential
            moving average of gradients; ``0.0`` gives plain gradient descent.

    Returns:
        A tuple ``(weights, bias, loss_history)`` where ``weights`` has shape
        (n_features, 1), ``bias`` has shape (1,), and ``loss_history`` holds
        the MSE of each step's batch, measured before that step's update.

    Raises:
        ValueError: If ``X`` is not 2-D, the dataset is empty, ``y`` does not
            supply exactly one target per row of ``X``, or ``batch_size`` is
            negative.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError("X must be 2-D with shape (n_samples, n_features)")
    num_samples, num_features = X.shape
    if num_samples == 0:
        raise ValueError("X must contain at least one sample")

    # Reshape once up front; indexing rows of the column vector later yields
    # the same (batch, 1) targets as reshaping each batch separately.
    targets = np.asarray(y, dtype=float).reshape(-1, 1)
    if targets.shape[0] != num_samples:
        # Without this check a shorter y silently restricts mini-batches to
        # the first len(y) rows of X, and a length-1 y silently broadcasts.
        raise ValueError(
            f"y must contain exactly one target per row of X "
            f"(got {targets.shape[0]} targets for {num_samples} rows)"
        )
    if batch_size is not None and batch_size < 0:
        raise ValueError("batch_size must be non-negative")

    weights = np.random.randn(num_features, 1)  # Random initial weights
    bias = np.random.randn(1)  # Random initial bias
    velocity_w = np.zeros((num_features, 1))  # Running average of weight gradients
    velocity_b = 0.0  # Running average of bias gradients
    loss_history = []  # Per-step MSE

    # Whether to subsample is loop-invariant, so decide it once.
    use_minibatch = bool(batch_size) and batch_size < num_samples

    for _ in range(epochs):
        if use_minibatch:
            indices = np.random.choice(num_samples, batch_size, replace=False)
            X_batch = X[indices]
            y_batch = targets[indices]
        else:
            X_batch = X
            y_batch = targets

        # Residuals of the current model on this batch
        predictions = np.dot(X_batch, weights) + bias
        errors = y_batch - predictions

        # Gradients of mean((y - Xw - b)^2) with respect to w and b
        gradient_weights = -2 * np.dot(X_batch.T, errors) / len(X_batch)
        gradient_bias = -2 * np.mean(errors)

        # Exponential moving average of the gradients (momentum smoothing)
        velocity_w = momentum * velocity_w + (1 - momentum) * gradient_weights
        velocity_b = momentum * velocity_b + (1 - momentum) * gradient_bias

        # Step against the smoothed gradient
        weights -= learning_rate * velocity_w
        bias -= learning_rate * velocity_b

        # Record the batch MSE computed from the pre-update residuals
        loss_history.append(np.mean(errors ** 2))

    return weights, bias, loss_history

def _plot_loss_curves(curves, title):
    """Show one figure with a labelled loss curve per entry of ``curves``."""
    plt.figure(figsize=(10, 5))
    for label, history in curves.items():
        plt.plot(history, label=label)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.title(title)
    plt.show()


# Synthetic regression data: y = 3*x0 + 5*x1 plus Gaussian noise (std 0.5).
# The seed is fixed so the data and every training run below are repeatable.
np.random.seed(42)
X = np.random.rand(2500, 2)
noise = np.random.randn(2500) * 0.5
y = 3 * X[:, 0] + 5 * X[:, 1] + noise

# Experiment 1: same optimiser, three batch regimes.
# train_gd draws from the global NumPy random state, so the order of the
# calls in this script determines the numbers each run sees.
w_batch, b_batch, loss_batch = train_gd(X, y, batch_size=None)  # whole dataset per step
w_sgd, b_sgd, loss_sgd = train_gd(X, y, batch_size=1)  # one sample per step
w_mini, b_mini, loss_mini = train_gd(X, y, batch_size=100)  # 100 samples per step

# Experiment 2: mini-batch training at several learning rates.
lr_values = [0.01, 0.05, 0.1]
losses_lr = {
    rate: train_gd(X, y, learning_rate=rate, batch_size=100)[2]
    for rate in lr_values
}

# Experiment 3: mini-batch training without and with momentum.
momentum_values = [0.0, 0.9]
losses_momentum = {
    beta: train_gd(X, y, learning_rate=0.05, batch_size=100, momentum=beta)[2]
    for beta in momentum_values
}

# One figure per experiment.
_plot_loss_curves(
    {"Batch GD": loss_batch, "SGD": loss_sgd, "Mini-batch GD": loss_mini},
    "Loss Comparison: Batch vs. SGD vs. Mini-batch",
)
_plot_loss_curves(
    {f"Learning Rate {rate}": history for rate, history in losses_lr.items()},
    "Effect of Learning Rate on Convergence",
)
_plot_loss_curves(
    {f"Momentum {beta}": history for beta, history in losses_momentum.items()},
    "Effect of Momentum on Convergence",
)
