# Logistic Regression from Scratch

This notebook implements binary and multiclass logistic regression from scratch.
The focus is on understanding assumptions, loss functions, and optimization behavior,
not performance or abstractions.

In [107]:
import numpy as np

In [108]:
# Logistic regression: discriminative model
# - Directly models P(y | x), not the data distribution P(x | y)
# - Uses a linear score in x; sigmoid/softmax only map scores to (0, 1)
# - Trains by minimizing log-loss (penalizes confident wrong predictions)
# - Makes no explicit assumptions about how x is generated

# Loss intuition
# - For each sample, we want the predicted probability of the true class to be high
# - Log-loss is just the negative log-likelihood over all samples
# - Using log turns products into sums and magnifies large errors

In [109]:
# Binary logistic regression setup (data + helper)
X = np.array([[1.0, 1.0], [1.2, 0.8], [0.8, 1.3], [1.1, 1.4],
              [3.0, 3.1], [2.8, 2.9], [3.2, 2.7], [3.1, 3.3]])
Y = np.array([0, 0, 0, 0, 1, 1, 1, 1])

n_samples, n_features = X.shape


def sigmoid(z):
    out = np.zeros_like(z)
    pos = z >= 0
    neg = ~pos
    out[pos] = 1 / (1 + np.exp(-z[pos]))
    exp_z = np.exp(z[neg])
    out[neg] = exp_z / (1 + exp_z)
    return out


In [110]:
# Binary logistic regression (gradient descent)
# - Linear score z = wÂ·x + b, sigmoid maps z to probability
# - We update weights to minimize average log-loss over the data

In [111]:
classes = np.unique(Y)
n_samples, n_features = np.shape(X)
X0 = X[Y == 0]
X1 = X[Y == 1]
P_Y1 = len(X1) / len(X)

mu0 = np.mean(X0, axis=0)
mu1 = np.mean(X1, axis=0)
var = (np.var(X, axis=0) / 2) + 1e-9

# Weights derived from Gaussian Naive Bayes assumptions
w0 = np.log((1 - P_Y1) / P_Y1) + np.sum((mu1**2 - mu0**2) / (2 * var))
w1 = (mu0 - mu1) / var
X_new = np.array([[1.0, 1.0], [1.2, 0.8], [0.8, 1.3], [1.1, 1.4],
                  [3.0, 3.1], [2.8, 2.9], [3.2, 2.7], [3.1, 3.3]])

print("Binary predictions (0/1):")
for i in X_new:
    if (w0 + np.dot(w1, i)) > 0:
        print(0)
    else:
        print(1)

z = w0 + np.dot(X, w1)
y_pred = sigmoid(z)

loss = np.mean(Y * np.log(y_pred + 1e-9) + (1 - Y) * np.log(1 - y_pred + 1e-9))
print(f"Final Binary Cross Entropy loss: {-loss:.4f}")


Binary predictions (0/1):
0
0
0
0
1
1
1
1
Final Binary Cross Entropy loss: 7.6858


In [112]:
#Adding L2 regularization over weights and Lipschitz for finding optimal step size
#L is effectively how curved/sharp the loss surface can be
#Since loss now is loss_data  + loss_reg (reg_strength*w**2) we are adding a parabola on top of it so we need to accomodate the required step size to check that 
cycles = 100
reg_strength =  0.00001
L_constant = (1/(4*n_samples)) * np.linalg.norm(X.T @ X, ord=2) + (reg_strength)
step_size = 1/L_constant
w0 = 0.0
w1 = np.zeros(n_features)
for i in range(cycles):
    z = np.dot(X, w1) + w0
    y_pred = sigmoid(z)
    err = y_pred-Y
    dw1 = (1 / n_samples) * np.dot(X.T, err) + (reg_strength)*w1
    dw0 = (1 / n_samples) * np.sum(err)
    w0 = w0 - step_size * dw0
    w1 = w1 - step_size * dw1

X_new = np.array([[1.0, 1.0], [1.2, 0.8], [0.8, 1.3], [1.1, 1.4],
                  [3.0, 3.1], [2.8, 2.9], [3.2, 2.7], [3.1, 3.3]])

print("binary LR predictions (0/1):")
for x in X_new:
    z = np.dot(w1, x) + w0
    print(1 if z > 0 else 0)
loss = np.mean(Y * np.log(y_pred + 1e-9) + (1 - Y) * np.log(1 - y_pred + 1e-9))
print(f"Final Binary Cross Entropy loss: {-loss:.4f}")

binary LR predictions (0/1):
0
0
0
0
1
1
1
1
Final Binary Cross Entropy loss: 0.1383


In [113]:
# Multiclass logistic regression (softmax)
# - One linear score per class
# - Softmax turns scores into a probability distribution over classes
# - Trained with cross-entropy loss via gradient descent

cycles = 1000
X = np.array([[1.0, 1.0], [1.2, 0.8], [0.8, 1.3], [1.1, 1.4],
              [3.0, 3.1], [2.8, 2.9], [3.2, 2.7], [3.1, 3.3]])
Y = np.array([0, 0, 1, 1, 1, 2, 2, 2])
classes = np.unique(Y)
w0_mult = np.zeros((1, len(classes)))
w_mult = np.zeros((n_features, len(classes)))
step_size = 0.1
Y_multi = np.zeros((n_samples, len(classes)))
for idx, c in enumerate(classes):
    Y_multi[Y == c, idx] = 1


def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)


for i in range(cycles):
    scores = np.dot(X, w_mult) + w0_mult
    dominance = softmax(scores)
    err = dominance - Y_multi
    dw1 = (1 / n_samples) * np.dot(X.T, err)
    dw0 = (1 / n_samples) * np.sum(err, axis=0, keepdims=True)
    w0_mult -= step_size * dw0
    w_mult -= step_size * dw1

print("softmax class probabilities:")
print(dominance)
print("multiclass LR predictions:")
print(np.argmax(dominance, axis=1))
loss = np.mean(Y * np.log(y_pred + 1e-9) + (1 - Y) * np.log(1 - y_pred + 1e-9))
print(f"Final Binary Cross Entropy loss: {loss:.4f}")

softmax class probabilities:
[[0.61668664 0.33825478 0.04505858]
 [0.87722163 0.08435287 0.0384255 ]
 [0.15312848 0.82144171 0.02542982]
 [0.21419537 0.7288828  0.05692182]
 [0.00493907 0.29593047 0.69913046]
 [0.00917108 0.36269698 0.62813194]
 [0.01988921 0.07106701 0.90904378]
 [0.00247077 0.31242136 0.68510787]]
multiclass LR predictions:
[0 0 1 1 2 2 2 2]
Final Binary Cross Entropy loss: 0.4447


## Known limitations

- Fixed learning rate (no scheduling or adaptivity).
- ~~No regularization (L1/L2 not implemented).~~
- Stopping is based on a fixed iteration count, not a convergence test.
- No explicit train/validation split or evaluation metrics beyond basic sanity checks.