# Linear Regression from Scratch

Quick reference for plain linear regression, gradient descent, closed form, and a simple ridge variant. Focus is on behavior and assumptions, not performance.

In [1]:
import numpy as np

In [2]:
# Linear regression: y = w0 + w1 x
# Assume y = f(x; w) + eps with eps ~ N(0, sigma^2) → MSE is the NLL
# MSE still works without Gaussian noise; Gaussian just justifies it
# Logistic regression analog: Bernoulli likelihood → log loss

In [3]:
# Notes
# - Hypothesis: linear (or polynomial) functions of x
# - Loss: MSE minimized by gradient descent; quadratic → smooth/convex
# - Step size matters: too large can overshoot
# - Polynomial fits can overfit quickly; ridge/L2 can tame large weights
# - Closed form exists (normal equations); GD shown below

In [4]:
# Quick reminders
# - Basis can be [1, x, x^2, ...]; higher degrees increase variance
# - Squared loss is convex/smooth; GD converges with reasonable step size (Lipschitz)
# - Normal equations give the closed form; think projection of y onto span(X)
# - Small eigenvalues → ill-conditioned X^T X; ridge adds λ to stabilize and reduce variance

In [5]:
# Estimator variance is 1/curvature (eigenvalue) along each eigen-direction of X^T X
# Small eigenvalues = flat directions → large variance; ridge adds λ to lift them and reduce sensitivity

In [12]:
#Random guess at step size
np.random.seed(0)
X = np.linspace(0,10,10)
Y = 3*X + 2 + np.random.normal(0,0.5,size=len(X))

w0 = 0
w1 = 0
def prediction(X):
    return w1*X + w0

step_size = 0.001
cycles = 100
n_samples = len(X)
for _ in range(cycles):
    y_pred = prediction(X)
    dw0 = (2/n_samples)*np.sum(y_pred - Y)
    dw1 = (2/n_samples)*np.sum(X*(y_pred - Y))
    w0 -= step_size*dw0
    w1 -= step_size*dw1
print("Actual Values")
print(*X)
print("Predicted values")
print(*prediction(X))
mse_gd = np.mean((Y - prediction(X))**2)
print(f"MSE (Gradient Descent): {mse_gd:.6f}")

Actual Values
0.0 1.1111111111111112 2.2222222222222223 3.3333333333333335 4.444444444444445 5.555555555555555 6.666666666666667 7.777777777777779 8.88888888888889 10.0
Predicted values
0.5915961892133562 4.181580699028643 7.7715652088439295 11.361549718659218 14.951534228474504 18.54151873828979 22.131503248105076 25.721487757920364 29.31147226773565 32.90145677755093
MSE (Gradient Descent): 1.524966


In [7]:

# For MSE Loss: L = max eigenvalue of (2/n * X_augmented.T @ X_augmented)
X_aug = np.column_stack((np.ones(len(X)),X))
Hessian = (2/len(X))*(X_aug.T @ X_aug)
eigenvalues = np.linalg.eigvals(Hessian)
L = np.max(eigenvalues)
step_size = 1 / L

w0, w1 = 0, 0
cycles = 1000
def prediction(X, w0, w1):
    return w1*X + w0

for _ in range(cycles):
    y_pred = prediction(X, w0, w1)
    dw0 = (2/len(X))*np.sum(y_pred - Y)
    dw1 = (2/len(X))*np.sum(X*(y_pred - Y))
    w0 -= step_size*dw0
    w1 -= step_size*dw1

print(f"Lipschitz Constant (L): {L:.4f}")
print(f"Calculated Step Size: {step_size:.4f}")
print(f"Final Weights: w0={w0:.4f}, w1={w1:.4f}")
print(f"MSE: {np.mean((Y - prediction(X, w0, w1))**2):.6f}")

Lipschitz Constant (L): 71.8030
Calculated Step Size: 0.0139
Final Weights: w0=2.7510, w1=2.9236
MSE: 0.174109


In [None]:
#nth order regression
#Random guess at step size
step_size_poly = 0.00001
cycles_poly = 100
mse_poly = []
for n in range(3):
    w = np.zeros(n+1)
    X_poly = np.column_stack([X**i for i in range(n+1)])
    def predictor_n(X):
        return np.dot(X_poly,w)
    for _ in range(cycles):
        y_pred = predictor_n(X_poly)
        error = y_pred - Y
        dw = (2/n_samples)*np.dot(X_poly.T,error)
        w -= step_size*dw
    print(f"Predicted values for degree {n}:", predictor_n(X_poly))
    mse = np.mean((Y - predictor_n(X_poly))**2)
    print(f"MSE for degree {n}: {mse:.6f}")
    mse_poly.append(mse)

Predicted values for degree 0: [3.15131527 3.15131527 3.15131527 3.15131527 3.15131527 3.15131527
 3.15131527 3.15131527 3.15131527 3.15131527]
MSE for degree 0: 289.364625
Predicted values for degree 1: [ 0.59159619  4.1815807   7.77156521 11.36154972 14.95153423 18.54151874
 22.13150325 25.72148776 29.31147227 32.90145678]
MSE for degree 1: 1.524966
Predicted values for degree 2: [-1.15754584e+55 -1.05933110e+57 -3.99991788e+57 -8.83333579e+57
 -1.55595848e+58 -2.41786650e+58 -3.46905763e+58 -4.70953188e+58
 -6.13928924e+58 -7.75832971e+58]
MSE for degree 2: 1413151835874182194421437278450108850661531190643686600948241143319558619148900396884729747894618764346411324069117952.000000


In [9]:
#Closed-form 
#Add a dummy column to X for accomodating the +2 in Y = 3X+2
def closed_form_W(X,Y):
    return np.array((np.linalg.inv(X.T @ X)) @ X.T @ Y)
X_closed = np.column_stack((np.ones(len(X)),X))
W = closed_form_W(X_closed,Y)
print("Closed form solution")
mse_closed = np.mean((Y - X_closed@W)**2)
print(f"MSE (Closed Form): {mse_closed:.6f}")

Closed form solution
MSE (Closed Form): 0.174109


In [10]:
X_closed_ridge = np.column_stack((np.ones(len(X)),X))

def closed_form_ridge(X, Y, l2_penalty):
    n_features = X.shape[1]
    I = np.eye(n_features)
    I[0,0] = 0
    A = (X.T @ X) + (l2_penalty * I)
    return np.linalg.inv(A) @ X.T @ Y

W_no_ridge = closed_form_ridge(X_closed_ridge, Y, l2_penalty=0)
W_ridge = closed_form_ridge(X_closed_ridge, Y, l2_penalty=10.0)

print("Weights without Ridge:", W_no_ridge)
print("Weights with Ridge (L2):     ", W_ridge)
mse_no_ridge = np.mean((Y - X_closed_ridge@W_no_ridge)**2)
mse_ridge = np.mean((Y - X_closed_ridge@W_ridge)**2)
print(f"MSE without ridge: {mse_no_ridge:.6f}")
print(f"MSE with ridge: {mse_ridge:.6f}")

Weights without Ridge: [2.75181763 2.92343879]
Weights with Ridge (L2):      [4.05865285 2.66207175]
MSE without ridge: 0.174109
MSE with ridge: 0.869887
