# Training a Logistic Regression Model: GD, SGD, and Solvers

## Overview
In this practice lab you will:
- Implement logistic regression training via Batch GD, Mini-batch GD, and SGD
- Explore learning rates, schedules, and early stopping
- Compare manual training with scikit-learn solvers (`lbfgs`, `liblinear`, `saga`)
- Analyze log loss curves and generalization with a validation split
- Practice threshold selection and interpretability of coefficients

## Key Concepts and Glossary References
- See: `Machine Learning with Python/notes/glossary-module-2.md` - Logistic Regression
- See: `Machine Learning with Python/notes/glossary-module-2.md` - Log Loss (Binary Cross-Entropy)
- See: `Machine Learning with Python/notes/glossary-module-2.md` - Gradient Descent
- See: `Machine Learning with Python/notes/glossary-module-2.md` - Batch Gradient Descent / Mini-batch / SGD
- See: `Machine Learning with Python/notes/glossary-module-2.md` - Learning Rate / Schedule / Early Stopping
- See: `Machine Learning with Python/notes/glossary-module-2.md` - `solver`, `penalty`, `C`, `class_weight`

In [None]:
# Run Me First: setup and packages
import numpy as np  # numerical computing
import pandas as pd  # data manipulation
import seaborn as sns  # visualization
import matplotlib.pyplot as plt  # plotting backend
from sklearn.model_selection import train_test_split  # data splitting
from sklearn.preprocessing import StandardScaler  # feature scaling
from sklearn.pipeline import Pipeline  # compose preprocessing + model
from sklearn.linear_model import LogisticRegression  # logistic model
from sklearn.metrics import log_loss, accuracy_score, precision_recall_fscore_support, roc_auc_score  # metrics

# Reproducibility: define the seed once, then reuse the constant
PY_RANDOM_SEED = 42  # shared seed constant referenced by later cells
np.random.seed(PY_RANDOM_SEED)  # seed NumPy's global RNG

# Plotting defaults so every figure in the notebook looks alike
sns.set(style="whitegrid", context="notebook")  # seaborn theme
plt.rcParams.update(
    {
        "figure.figsize": (8, 5),  # default figure size
        "axes.titlesize": 14,  # title size
        "axes.labelsize": 12,  # axis label size
    }
)

In [None]:
# Synthetic dataset: e-commerce conversion scenario
# We simulate a binary outcome: whether a visitor converts (1) or not (0)
# Features: sessions, pages_per_session, is_mobile, is_campaign, time_on_site, country_US
n = 800  # number of samples
rng = np.random.default_rng(PY_RANDOM_SEED)  # random generator

# Base features. The order of the draws below determines the generated data,
# so keep it fixed to stay reproducible for a given seed.
sessions = rng.poisson(lam=3.5, size=n)  # visits per week
pages_per_session = rng.normal(loc=5.0, scale=1.2, size=n).clip(1, None)  # pages per visit (floored at 1)
is_mobile = rng.binomial(1, 0.6, size=n)  # 60% mobile traffic
is_campaign = rng.binomial(1, 0.3, size=n)  # 30% currently in a campaign cohort
time_on_site = rng.gamma(shape=2.0, scale=90.0, size=n)  # seconds on site
country_US = rng.binomial(1, 0.65, size=n)  # 65% from US

# Combine into the design matrix (without intercept yet).
# np.column_stack upcasts every column to float, which is what the linear algebra needs.
feature_names = ["sessions", "pages_per_session", "is_mobile", "is_campaign", "time_on_site", "country_US"]  # names
feature_columns = [sessions, pages_per_session, is_mobile, is_campaign, time_on_site, country_US]  # same order as names
X = np.column_stack(feature_columns)  # features matrix

# True underlying weights for the logistic model (ground truth for simulation)
w_true = np.array([0.25, 0.35, -0.4, 0.9, 0.002, 0.15])  # coefficients
b_true = -2.0  # intercept term (bias)


def sigmoid(z):
    """Map real-valued logits to probabilities in (0, 1)."""
    return 1.0 / (1.0 + np.exp(-z))


# Generate probabilities, then sample binary labels
logits = X @ w_true + b_true  # linear combination
proba = sigmoid(logits)  # convert to probabilities
y = rng.binomial(1, proba)  # sample labels from Bernoulli

# Label noise: flip a few labels so the data is not perfectly logistic
n_flipped = 20  # number of labels to flip
flip_idx = rng.choice(n, size=n_flipped, replace=False)  # distinct indices to flip
y[flip_idx] = 1 - y[flip_idx]  # flip labels to add noise

# Build the DataFrame for EDA column by column so integer features keep their
# integer dtype (building it from X would turn every column into float).
raw_df = pd.DataFrame(dict(zip(feature_names, feature_columns)))  # create DataFrame
raw_df["converted"] = y  # add target column

# quick peek
raw_df.head()  # show first few rows

# Data dictionary

| Column            | Type   | Description                                         |
|-------------------|--------|-----------------------------------------------------|
| sessions          | int    | Visits per week for the user                        |
| pages_per_session | float  | Average pages viewed per visit                      |
| is_mobile         | int    | 1 if mobile device, else 0                          |
| is_campaign       | int    | 1 if user is part of the current marketing campaign |
| time_on_site      | float  | Total time on site this week (seconds)              |
| country_US        | int    | 1 if user is in the US, else 0                      |
| converted         | int    | Target: 1 if user converted, else 0                 |

In [None]:
# Train/validation/test split for honest evaluation
from sklearn.model_selection import StratifiedShuffleSplit  # stratified splitting

# Step 1: stratified 80/20 split -> (train+val) vs. hold-out test, preserving class balance
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=PY_RANDOM_SEED)
idx_train_val, idx_test = next(sss.split(raw_df[feature_names], raw_df["converted"]))

df_train_val = raw_df.iloc[idx_train_val].reset_index(drop=True)  # 80%: training + validation
df_test = raw_df.iloc[idx_test].reset_index(drop=True)  # 20%: hold-out test set

# Step 2: stratified 75/25 split of the 80% -> 60/20/20 overall
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=PY_RANDOM_SEED)
idx_train, idx_val = next(sss2.split(df_train_val[feature_names], df_train_val["converted"]))

df_train = df_train_val.iloc[idx_train].reset_index(drop=True)  # 60%
df_val = df_train_val.iloc[idx_val].reset_index(drop=True)  # 20%

# Step 3: pull out NumPy arrays (features, labels) for each split
X_train, y_train = df_train[feature_names].to_numpy(), df_train["converted"].to_numpy()
X_val, y_val = df_val[feature_names].to_numpy(), df_val["converted"].to_numpy()
X_test, y_test = df_test[feature_names].to_numpy(), df_test["converted"].to_numpy()

# Sanity check: the positive rate should be nearly identical across the splits
print(
    "Class balance (train/val/test):",
    *(labels.mean().round(3) for labels in (y_train, y_val, y_test)),
)

In [None]:
# EDA on the training split only: one distribution, one scatter, one correlation map

# 1) distribution of pages per session
ax = sns.histplot(df_train["pages_per_session"], kde=True)
ax.set_title("Pages per session (train)")
plt.show()

# 2) engagement features against each other, colored by the target
ax = sns.scatterplot(
    x="time_on_site",
    y="pages_per_session",
    hue="converted",
    data=df_train,
    alpha=0.6,
)
ax.set_title("Time on site vs Pages/session (colored by conversion)")
plt.show()

# 3) pairwise correlations between all features and the target
corr = df_train[[*feature_names, "converted"]].corr(numeric_only=True)
sns.heatmap(corr, annot=False, cmap="coolwarm", center=0)
plt.title("Correlation heatmap (train)")
plt.show()

In [None]:
# Manual logistic regression with Batch GD, Mini-batch, and SGD
# Helper functions for sigmoid, log loss, and gradient
import math  # math utilities

def sigmoid_np(z: np.ndarray) -> np.ndarray:  # vectorized sigmoid
    """Return the elementwise logistic sigmoid of ``z`` without overflowing.

    The textbook form ``1 / (1 + exp(-z))`` overflows in ``exp`` for large
    negative ``z``. Here the exponential is always taken of a non-positive
    number, and the algebraically equivalent branch is picked per element:
    ``1 / (1 + e)`` for ``z >= 0`` and ``e / (1 + e)`` for ``z < 0``, where
    ``e = exp(-|z|)``.
    """
    z = np.asarray(z, dtype=float)  # accept lists/scalars as well as arrays
    e = np.exp(-np.abs(z))  # in (0, 1], so it can never overflow
    return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))  # elementwise sigmoid

def add_intercept(Xm: np.ndarray) -> np.ndarray:
    """Return ``Xm`` with a leading column of ones (the bias/intercept feature)."""
    bias_column = np.ones(Xm.shape[0])  # one entry per sample
    return np.column_stack((bias_column, Xm))  # ones first, then the original columns

def binary_log_loss(y_true: np.ndarray, y_prob: np.ndarray, eps: float = 1e-15) -> float:  # compute average log loss
    """Return the mean binary cross-entropy between labels and probabilities.

    Args:
        y_true: 0/1 labels (array or any sequence convertible to one).
        y_prob: predicted probabilities of the positive class, same shape as ``y_true``.
        eps: probabilities are clipped to ``[eps, 1 - eps]`` so ``log`` never sees 0.

    Returns:
        The average log loss as a Python float.
    """
    # Coerce both inputs so plain lists work too (``1 - y_true`` fails on a list).
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.clip(np.asarray(y_prob, dtype=float), eps, 1 - eps)  # avoid log(0)
    per_sample = y_true * np.log(y_prob) + (1 - y_true) * np.log(1 - y_prob)  # log-likelihood per sample
    return float(-per_sample.mean())  # scalar loss

# Standardize features (gradient-based training behaves better on a common scale).
# The scaler learns its statistics from the training split only; the same fitted
# transform is then applied to every split.
scaler = StandardScaler().fit(X_train)
X_train_s, X_val_s, X_test_s = (scaler.transform(part) for part in (X_train, X_val, X_test))

# Prepend the intercept column of ones to each standardized split
Xtr, Xv, Xte = (add_intercept(part) for part in (X_train_s, X_val_s, X_test_s))

# Zero-initialized weight vector, one entry per column (intercept included)
n_features = Xtr.shape[1]
w0 = np.zeros(n_features)

# Training loop factory
from typing import Tuple, List  # typing annotations

def train_logreg(
    X: np.ndarray, y: np.ndarray, X_val_: np.ndarray, y_val_: np.ndarray,
    w_init: np.ndarray, lr: float = 0.1, epochs: int = 200,
    batch_size: int | None = None, shuffle: bool = True,
    early_stopping: bool = True, patience: int = 10,
) -> Tuple[np.ndarray, List[float], List[float]]:
    """Train logistic regression with Batch GD, Mini-batch GD, or SGD.

    The update rule is the same in all three modes; only the batch size differs:
    - ``batch_size`` is None (or >= n): Batch GD, one step per epoch.
    - ``batch_size == 1``: SGD, one step per sample.
    - otherwise: Mini-batch GD.

    Args:
        X: training design matrix; must already contain the intercept column
            if an intercept is wanted (weights are applied as ``X @ w``).
        y: training labels (0/1), one per row of ``X``.
        X_val_: validation design matrix with the same columns as ``X``.
        y_val_: validation labels.
        w_init: starting weights; copied, never modified in place.
        lr: constant learning rate.
        epochs: maximum number of passes over the training data.
        batch_size: samples per gradient step (None means the full training set).
        shuffle: reshuffle the sample order at the start of every epoch.
        early_stopping: track validation loss and stop after ``patience``
            consecutive epochs without improvement.
        patience: number of non-improving epochs tolerated before stopping.

    Returns:
        ``(weights, train_losses, val_losses)`` with one loss entry per completed
        epoch. With ``early_stopping=True`` the weights are the snapshot with the
        lowest validation loss, whether training stopped early or used all epochs;
        otherwise they are the weights after the last epoch.

    Raises:
        ValueError: if ``X`` has no rows or ``batch_size`` is smaller than 1.
    """
    n = X.shape[0]  # number of samples
    if n == 0:
        raise ValueError("X must contain at least one sample")
    bs = n if batch_size is None else batch_size  # effective batch size
    if bs < 1:
        # A negative step would make the batch loop empty and silently skip training.
        raise ValueError(f"batch_size must be a positive integer or None, got {batch_size!r}")

    rng_local = np.random.default_rng(PY_RANDOM_SEED)  # local RNG -> same shuffles on every call
    w = np.array(w_init, dtype=float)  # float copy so the in-place update also works for int inits
    best_val = math.inf  # best validation loss seen so far
    best_w = w.copy()  # weights that achieved best_val
    wait = 0  # epochs since the last improvement
    train_losses, val_losses = [], []  # per-epoch history

    for epoch in range(epochs):  # iterate over epochs
        indices = np.arange(n)  # indices 0..n-1
        if shuffle:
            rng_local.shuffle(indices)  # new sample order each epoch
        for start in range(0, n, bs):  # batch loop (last batch may be smaller)
            idx = indices[start:start + bs]  # batch indices
            Xb = X[idx]  # batch features
            yb = y[idx]  # batch labels
            p = sigmoid_np(Xb @ w)  # predicted probabilities
            grad = Xb.T @ (p - yb) / Xb.shape[0]  # gradient of the mean log loss on this batch
            w -= lr * grad  # gradient descent step

        # End of epoch: evaluate on the full train and validation sets
        tr_loss = binary_log_loss(y, sigmoid_np(X @ w))  # train loss
        va_loss = binary_log_loss(y_val_, sigmoid_np(X_val_ @ w))  # val loss
        train_losses.append(tr_loss)
        val_losses.append(va_loss)

        if early_stopping:
            if va_loss + 1e-8 < best_val:  # count only improvements larger than the tolerance
                best_val = va_loss
                best_w = w.copy()  # snapshot
                wait = 0  # reset patience
            else:
                wait += 1
                if wait >= patience:  # patience exhausted
                    break

    if early_stopping:
        # Restore the best snapshot even when the epoch budget ran out before
        # patience did; the weights from the final epoch may be worse.
        w = best_w
    return w, train_losses, val_losses

# Train the same model three ways; only the batch size (and learning rate/patience) differ
_fit_args = (Xtr, y_train, Xv, y_val, w0)  # data and initial weights shared by every run
w_gd, tr_gd, va_gd = train_logreg(*_fit_args, lr=0.2, epochs=300, batch_size=None, patience=15)  # Batch GD
w_mb, tr_mb, va_mb = train_logreg(*_fit_args, lr=0.2, epochs=300, batch_size=64, patience=15)  # Mini-batch 64
w_sgd, tr_sgd, va_sgd = train_logreg(*_fit_args, lr=0.05, epochs=300, batch_size=1, patience=20)  # pure SGD

# Overlay the per-epoch validation loss of each method
for _curve, _label in ((va_gd, "Batch GD"), (va_mb, "Mini-batch (64)"), (va_sgd, "SGD")):
    plt.plot(_curve, label=_label)
plt.title("Validation loss by training method")
plt.xlabel("Epoch")
plt.ylabel("Validation Log Loss")
plt.legend()
plt.show()

In [None]:
# Evaluate manual model on test set (best of three by val loss)
# Score each candidate by the validation loss of the weights that were actually
# returned. The last entry of a loss history is not reliable for this: when early
# stopping restores an earlier snapshot, the final recorded loss belongs to
# different weights than the ones returned.
candidates = {"gd": w_gd, "mb": w_mb, "sgd": w_sgd}  # returned weights per method
val_losses = {key: binary_log_loss(y_val, sigmoid_np(Xv @ w_cand)) for key, w_cand in candidates.items()}  # val loss per method
best_key = min(val_losses, key=val_losses.get)  # argmin
w_best = candidates[best_key]  # select weights

# test metrics
p_test = sigmoid_np(Xte @ w_best)  # test probabilities
pred_test = (p_test >= 0.5).astype(int)  # 0.5 threshold
acc = accuracy_score(y_test, pred_test)  # accuracy
prec, rec, f1, _ = precision_recall_fscore_support(y_test, pred_test, average="binary")  # PRF1
auc = roc_auc_score(y_test, p_test)  # ROC-AUC
ll = binary_log_loss(y_test, p_test)  # log loss
print({"best": best_key, "accuracy": round(acc, 3), "precision": round(prec, 3), "recall": round(rec, 3), "f1": round(f1, 3), "roc_auc": round(auc, 3), "log_loss": round(ll, 3)})  # results

In [None]:
# Fit the same L2-regularized model with each scikit-learn solver and compare metrics
solvers = ["lbfgs", "liblinear", "saga"]
results = []
for solver in solvers:
    # Scaling lives inside the pipeline, so it is fitted on the training split only
    model = LogisticRegression(solver=solver, penalty="l2", C=1.0, max_iter=2000, random_state=PY_RANDOM_SEED)
    pipe = Pipeline([("scaler", StandardScaler()), ("lr", model)])
    pipe.fit(df_train[feature_names], y_train)

    # Positive-class probabilities on the validation and test splits
    p_val = pipe.predict_proba(df_val[feature_names])[:, 1]
    p_test = pipe.predict_proba(df_test[feature_names])[:, 1]
    y_pred = (p_test >= 0.5).astype(int)  # hard labels at the 0.5 threshold

    res = dict(
        solver=solver,
        val_log_loss=round(log_loss(y_val, p_val), 3),
        test_log_loss=round(log_loss(y_test, p_test), 3),
        test_accuracy=round(accuracy_score(y_test, y_pred), 3),
        roc_auc=round(roc_auc_score(y_test, p_test), 3),
    )
    results.append(res)
results  # show comparison table

In [None]:
# Learning rate schedule demo (time-based decay)
# η_t = η0 / (1 + decay * t)

def time_decay_lr(eta0: float, decay: float, t: int) -> float:
    """Return the time-decayed learning rate ``eta0 / (1 + decay * t)`` for step ``t``."""
    shrink = 1.0 + decay * t  # grows linearly with t, so the rate falls hyperbolically
    return eta0 / shrink

# Batch GD where the learning rate is recomputed from the schedule at every epoch
eta0, decay = 0.3, 0.02  # initial lr and decay factor
w_sched = w0.copy()  # start from the shared initial weights without mutating them
train_hist, val_hist = [], []  # per-epoch losses
for t in range(200):
    # full-batch gradient of the mean log loss at the current weights
    p = sigmoid_np(Xtr @ w_sched)
    grad = Xtr.T @ (p - y_train) / Xtr.shape[0]
    # step with the decayed rate for epoch t
    lr_t = time_decay_lr(eta0, decay, t)
    w_sched -= lr_t * grad
    # losses are measured after the update
    p_train_after = sigmoid_np(Xtr @ w_sched)
    p_val_after = sigmoid_np(Xv @ w_sched)
    train_hist.append(binary_log_loss(y_train, p_train_after))
    val_hist.append(binary_log_loss(y_val, p_val_after))

# Validation loss under the schedule
plt.plot(val_hist, label="Time decay schedule")
plt.title("Learning rate schedule: time-based decay")
plt.xlabel("Epoch")
plt.ylabel("Validation Log Loss")
plt.legend()
plt.show()

# Practice Exercises

1) Try three different learning rates for Batch GD (e.g., 0.01, 0.1, 0.5). Plot validation loss and explain which converges best and why.

2) Implement early stopping with patience=5 for Mini-batch GD and show the selected epoch. Compare to training without early stopping.

3) Compare solvers with `penalty='l1'` (where supported) and discuss sparsity of coefficients.

4) Tune threshold to maximize F1 on the validation set, then report test metrics at that threshold.

5) Optional: Add `class_weight='balanced'` and compare ROC-AUC on this dataset.

In [None]:
# Exercise 1: Batch GD with three learning rates, compared by validation loss
learning_rates = [0.01, 0.1, 0.5]
curves = {}  # learning rate -> per-epoch validation losses
for lr in learning_rates:
    # only the validation history is kept; weights and train losses are not used here
    w_tmp, _, curves[lr] = train_logreg(Xtr, y_train, Xv, y_val, w0, lr=lr, epochs=200, batch_size=None, patience=15)

# One line per learning rate
for lr, vloss in curves.items():
    plt.plot(vloss, label=f"lr={lr}")
plt.xlabel("Epoch")
plt.ylabel("Val Log Loss")
plt.title("Validation loss for different learning rates (Batch GD)")
plt.legend()
plt.show()

In [None]:
# Exercise 2: Early stopping for Mini-batch GD (patience=5)
w_es, tr_es, va_es = train_logreg(Xtr, y_train, Xv, y_val, w0, lr=0.2, epochs=300, batch_size=64, patience=5)  # train with ES

# Baseline for comparison: identical settings, but early stopping switched off
w_no_es, tr_no_es, va_no_es = train_logreg(Xtr, y_train, Xv, y_val, w0, lr=0.2, epochs=300, batch_size=64, early_stopping=False)

# Selected epoch = the epoch whose recorded validation loss matches the loss of
# the returned weights (0-based index into the history).
sel_loss = binary_log_loss(y_val, sigmoid_np(Xv @ w_es))  # val loss of the returned weights
sel_epoch = int(np.argmin(np.abs(np.asarray(va_es) - sel_loss)))  # closest history entry
print({
    "selected_epoch": sel_epoch,
    "epochs_run_with_es": len(va_es),
    "val_loss_selected": round(sel_loss, 4),
    "epochs_run_without_es": len(va_no_es),
    "val_loss_final_without_es": round(va_no_es[-1], 4),
})

plt.plot(va_no_es, label="Mini-batch without early stopping")  # full-length baseline
plt.plot(va_es, label="Mini-batch with early stopping")  # plot
plt.axvline(sel_epoch, color="gray", linestyle="--", label=f"selected epoch ({sel_epoch})")  # mark the chosen epoch
plt.xlabel("Epoch")  # x label
plt.ylabel("Val Log Loss")  # y label
plt.legend()  # legend
plt.show()  # render

In [None]:
# Exercise 3: fit L1-penalized models and measure how many coefficients are exactly zero
res_l1 = []
for solver in ("liblinear", "saga"):  # the two solvers used here with penalty="l1"
    l1_model = LogisticRegression(solver=solver, penalty="l1", C=0.5, max_iter=2000, random_state=PY_RANDOM_SEED)
    pipe = Pipeline([("scaler", StandardScaler()), ("lr", l1_model)])
    pipe.fit(df_train[feature_names], y_train)

    coef = pipe.named_steps["lr"].coef_.ravel()  # flatten to one value per feature
    sparsity = float(np.mean(coef == 0))  # share of coefficients that are exactly zero
    res_l1.append(dict(solver=solver, sparsity=round(sparsity, 3), coef=coef))
res_l1  # inspect sparsity

In [None]:
# Exercise 4: Threshold tuning for F1 on validation set
# The threshold is chosen on the validation split only; the test split is touched
# once at the end to report metrics at that threshold.
pipe = Pipeline([("scaler", StandardScaler()), ("lr", LogisticRegression(solver="lbfgs", C=1.0, max_iter=2000, random_state=PY_RANDOM_SEED))])  # pipeline
pipe.fit(df_train[feature_names], y_train)  # fit on train
p_val = pipe.predict_proba(df_val[feature_names])[:, 1]  # val probs
best_f1, best_t = -1.0, 0.5  # init
for t in np.linspace(0.1, 0.9, 33):  # candidate thresholds (step 0.025)
    y_hat = (p_val >= t).astype(int)  # predictions
    # zero_division=0: a threshold that predicts no positives scores 0 explicitly
    # instead of emitting an UndefinedMetricWarning for every such candidate.
    _, _, f1, _ = precision_recall_fscore_support(y_val, y_hat, average="binary", zero_division=0)  # F1
    if f1 > best_f1:  # strict '>' keeps the lowest threshold among ties
        best_f1, best_t = float(f1), float(t)  # store
# evaluate on test at best threshold
p_test = pipe.predict_proba(df_test[feature_names])[:, 1]  # test probs
y_hat_test = (p_test >= best_t).astype(int)  # apply threshold
acc = accuracy_score(y_test, y_hat_test)  # accuracy
prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_hat_test, average="binary", zero_division=0)  # PRF1
print({"best_threshold": round(best_t, 3), "test_accuracy": round(acc, 3), "precision": round(prec, 3), "recall": round(rec, 3), "f1": round(f1, 3)})  # report

# Solutions (expand if desired)
The code cells above provide reference implementations for Exercises 1–4; Exercise 5 is optional and left for you to try.

In [None]:
# Version info for reproducibility
import sys  # python runtime
import sklearn  # scikit-learn
import seaborn  # seaborn version
# Gather the interpreter and library versions in one mapping, then print it
versions = {
    "python": sys.version.split()[0],  # first token only, e.g. "3.11.4"
    "numpy": np.__version__,
    "pandas": pd.__version__,
    "sklearn": sklearn.__version__,
    "seaborn": seaborn.__version__,
}
print(versions)