# Diabetes classification
This notebook predicts whether a person has any type of diabetes (yes/no) using a custom logistic regression trained on the BRFSS 2015 health indicators.


In [None]:
import numpy as np
from pathlib import Path

# CSV file read by this cell; the relative path is resolved against the
# current working directory.
DATA_PATH = Path("diabetes_012_health_indicators_BRFSS2015.csv")

# Parse the whole file as a float matrix, skipping the single header row.
raw = np.loadtxt(DATA_PATH, delimiter=",", skiprows=1)

# Column 0 is the label: every value above zero is folded into the positive
# class, so the target is binary (stored as 0.0 / 1.0).
y = np.greater(raw[:, 0], 0).astype(np.float64)

# All remaining columns are used as features.
X = raw[:, 1:]

print(f"Loaded {X.shape[0]} samples with {X.shape[1]} features")
print(f"Share of positive diabetes cases: {y.mean():.3f}")


In [None]:
# Fixed seed so the shuffled 80/20 split is reproducible between runs.
rng = np.random.default_rng(42)
indices = np.arange(len(X))
rng.shuffle(indices)

# The first 80% of the shuffled row indices form the training set, the rest
# the test set.
split = int(0.8 * len(indices))
train_idx, test_idx = indices[:split], indices[split:]

X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

# Standardisation statistics come from the training rows only, so nothing
# about the test rows leaks into preprocessing.
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)
# A constant feature has zero spread; use 1.0 there to avoid dividing by zero.
std = np.where(std == 0, 1.0, std)

# Apply the same training-derived transform to both partitions.
X_train, X_test = ((part - mean) / std for part in (X_train, X_test))

print(f"Train size: {len(X_train)} | Test size: {len(X_test)}")


In [None]:
class LogisticRegressionGD:
    """Binary logistic regression fitted with full-batch gradient descent.

    Every epoch uses all training rows to compute the gradient of the mean
    log-loss plus an L2 penalty ``reg / 2 * ||w||^2`` on the weights; the
    bias term is not penalised.

    Parameters
    ----------
    lr : float
        Step size of each gradient-descent update.
    epochs : int
        Number of full passes (parameter updates) performed by ``fit``.
    reg : float
        Strength of the L2 penalty applied to the weight vector.

    Attributes
    ----------
    w : numpy.ndarray or None
        Weight vector with one entry per feature; ``None`` until ``fit`` runs.
    b : float
        Bias (intercept) term.
    """

    def __init__(self, lr=0.05, epochs=2000, reg=0.001):
        self.lr = lr
        self.epochs = epochs
        self.reg = reg
        self.w = None
        self.b = 0.0

    @staticmethod
    def _sigmoid(z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

        Only ``exp`` of non-positive values is ever evaluated, so very
        negative logits no longer overflow (and emit a RuntimeWarning) the
        way a direct ``np.exp(-z)`` does.
        """
        z = np.asarray(z)
        decay = np.exp(-np.abs(z))  # always in (0, 1], cannot overflow
        # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)).
        return np.where(z >= 0, 1.0, decay) / (1.0 + decay)

    def fit(self, X, y, verbose=False):
        """Learn weights and bias from ``X`` (2-D features) and ``y`` (0/1 labels).

        Both parameters are re-initialised on every call, so refitting the
        same instance starts from scratch instead of continuing from the
        bias left behind by a previous fit.

        Parameters
        ----------
        X : numpy.ndarray
            Feature matrix of shape ``(n_samples, n_features)``.
        y : numpy.ndarray
            Target vector of length ``n_samples`` holding 0/1 labels.
        verbose : bool
            When true, print the (unregularised) log-loss every 500 epochs.

        Returns
        -------
        LogisticRegressionGD
            The fitted instance itself, to allow call chaining.
        """
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features)
        self.b = 0.0  # reset together with w so repeated fits are independent
        for epoch in range(1, self.epochs + 1):
            linear = X @ self.w + self.b
            preds = self._sigmoid(linear)
            error = preds - y

            # Gradient of the mean log-loss; the L2 term touches weights only.
            grad_w = (X.T @ error) / n_samples + self.reg * self.w
            grad_b = error.mean()

            self.w -= self.lr * grad_w
            self.b -= self.lr * grad_b

            if verbose and epoch % 500 == 0:
                # Loss of the predictions made *before* this epoch's update;
                # the 1e-8 offset keeps log() finite for saturated outputs.
                loss = -np.mean(
                    y * np.log(preds + 1e-8) + (1 - y) * np.log(1 - preds + 1e-8)
                )
                print(f"epoch {epoch:4d} | loss {loss:.4f}")
        return self

    def predict_proba(self, X):
        """Return the predicted probability of the positive class per row."""
        return self._sigmoid(X @ self.w + self.b)

    def predict(self, X):
        """Return hard 0/1 labels (as ints) using a 0.5 probability threshold."""
        return (self.predict_proba(X) >= 0.5).astype(int)


In [None]:
# Train with the default hyper-parameters; fit() hands back the estimator
# itself, so construction and training can be chained in one statement.
model = LogisticRegressionGD().fit(X_train, y_train, verbose=True)

def evaluate(name, X_split, y_split):
    """Print accuracy, precision and recall of the global ``model`` on one split.

    ``name`` is the label prefixed to every printed line, ``X_split`` the
    feature matrix passed to ``model.predict`` and ``y_split`` the matching
    0/1 ground-truth labels. Nothing is returned.
    """
    y_hat = model.predict(X_split)
    flagged = y_hat == 1      # rows the model labels positive
    truly_pos = y_split == 1  # rows that really are positive

    # Confusion-matrix counts needed for the two ratio metrics.
    tp = np.sum(flagged & truly_pos)
    fp = np.sum(flagged & (y_split == 0))
    fn = np.sum((y_hat == 0) & truly_pos)

    acc = np.mean(y_hat == y_split)
    # The small constant keeps the ratios defined when a denominator is zero.
    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)

    print(f"{name} accuracy : {acc:.3f}")
    print(f"{name} precision: {precision:.3f}")
    print(f"{name} recall   : {recall:.3f}\n")

# Report the same metrics on both partitions, training split first.
for split_name, features, labels in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    evaluate(split_name, features, labels)
