# In [1]:
from numpy import exp
import numpy as np
import pandas as pd
from tqdm import tqdm


# Relative path of the raw dataset CSV handed to pandas.read_csv; being
# relative, it is resolved against the current working directory.
DATA_PATH = "diabetic_data.csv"
RANDOM_SEED = 109  # for reproducible train/test split


def main():
    """Run the whole experiment: preprocess, train, score, report.

    Loads the train/test split, prepends a constant "intercept" column to
    both feature tables, fits the logistic-regression weights on the
    training part, prints the test-set accuracy (threshold 0.5) and then
    prints every learned weight.
    """
    # ---- STEP 2–4: load, clean, feature-engineer, split ----
    features_train, labels_train, features_test, labels_test = load_and_preprocess()

    # Column of ones in position 0 so that weights[0] acts as the bias term.
    for frame in (features_train, features_test):
        frame.insert(0, "intercept", 1)

    # ---- fit the model on the training portion ----
    theta = train_logistic_regression(features_train, labels_train)

    # ---- count test rows whose thresholded prediction matches the label ----
    hits = sum(
        1
        for row, actual in zip(features_test.values, labels_test)
        if int(predict(row, theta) > 0.5) == actual
    )

    test_accuracy = hits / len(labels_test)
    print(f"Test Accuracy: {test_accuracy:.5f}")

    # ---- dump the learned parameters, one per line ----
    for i, w in enumerate(theta):
        print(f"w{i}: {w:.7f}")


# ============================================================
# ===============  STEP 2–4: PREPROCESSING  ===================
# ============================================================

def load_and_preprocess(data_path=None, seed=None, train_fraction=0.8):
    """
    Load the raw UCI diabetes dataset, clean it, choose features,
    and split into train/test sets.

    Args:
        data_path: Path (or file-like object accepted by pandas.read_csv)
            of the raw CSV. Defaults to the module-level DATA_PATH.
        seed: Seed for the shuffle that precedes the split. Defaults to
            the module-level RANDOM_SEED.
        train_fraction: Share of the cleaned rows that goes to the
            training set (default 0.8, i.e. an 80/20 split).

    Returns:
        train_x: DataFrame of training features (all columns float)
        train_y: Series of training labels (0/1)
        test_x:  DataFrame of test features (all columns float)
        test_y:  Series of test labels (0/1)

    Raises:
        ValueError: if train_fraction is outside [0, 1].
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError("train_fraction must be between 0 and 1")

    csv_path = DATA_PATH if data_path is None else data_path
    split_seed = RANDOM_SEED if seed is None else seed

    # The full dataset has ~50 columns; we pick a smaller, reasonable subset.
    # Everything not listed in categorical_cols below (and not "readmitted")
    # is a plain numeric count.
    cols_to_keep = [
        "race",
        "gender",
        "age",
        "time_in_hospital",
        "num_lab_procedures",
        "num_procedures",
        "num_medications",
        "number_outpatient",
        "number_emergency",
        "number_inpatient",
        "number_diagnoses",
        "max_glu_serum",
        "A1Cresult",
        "change",
        "diabetesMed",
        "readmitted",
    ]
    categorical_cols = [
        "race",
        "gender",
        "age",
        "max_glu_serum",
        "A1Cresult",
        "change",
        "diabetesMed",
    ]

    # --------- 1. Load the raw CSV ---------
    # Missing values are encoded as "?" in this file. pandas' *default* NA
    # list also contains the string "None" (pandas >= 2.0), which is a real
    # category ("test not taken") in the lab-result columns; with the
    # defaults those rows would become NaN and be thrown away by dropna().
    # So the defaults are switched off and only "?" / empty cells are NA.
    df = pd.read_csv(
        csv_path,
        na_values=["?", ""],
        keep_default_na=False,
    )

    # --------- 2. Keep only the chosen columns, drop incomplete rows ---------
    # (the dataset is big, so we can afford to lose some rows)
    df = df[cols_to_keep].dropna()

    # --------- 3. Binary label: 1 if readmitted within 30 days, else 0 ---------
    # readmitted column has 3 values: "<30", ">30", "NO"
    y = (df["readmitted"] == "<30").astype(int).rename("Label")

    # --------- 4. One-hot encode categorical columns ---------
    # drop_first avoids one redundant dummy column per feature. Everything
    # is cast to float so that X.values is a homogeneous numeric array
    # (newer pandas would otherwise emit bool dummies and an object array).
    X = pd.get_dummies(
        df.drop(columns=["readmitted"]),
        columns=categorical_cols,
        drop_first=True,
        dtype=float,
    ).astype(float)

    # --------- 5. Train/test split (no sklearn, just numpy) ---------
    # A private RandomState yields the same permutation as seeding the
    # global generator, without touching global RNG state.
    n = len(X)
    indices = np.random.RandomState(split_seed).permutation(n)

    train_size = int(train_fraction * n)
    train_idx = indices[:train_size]
    test_idx = indices[train_size:]

    train_x = X.iloc[train_idx].reset_index(drop=True)
    train_y = y.iloc[train_idx].reset_index(drop=True)
    test_x = X.iloc[test_idx].reset_index(drop=True)
    test_y = y.iloc[test_idx].reset_index(drop=True)

    return train_x, train_y, test_x, test_y


# ============================================================
# ===============  LOGISTIC REGRESSION CODE  =================
# ============================================================

def train_logistic_regression(train_x, train_y, n_iterations=1000,
                              learning_rate=0.0001, show_progress=True):
    """Fit logistic-regression weights by batch gradient ascent.

    Every iteration accumulates, over all training rows, the gradient of
    the log-likelihood
        gradient[j] = Σ_i x_ij * (y_i - σ(θ^T x_i))
    and then moves θ by learning_rate * gradient. The per-row / per-feature
    loops are expressed as matrix products so each iteration runs inside
    NumPy rather than the Python interpreter.

    Args:
        train_x: 2-D table of features (DataFrame or array-like), one row
            per example, including the intercept column if one is wanted.
        train_y: 1-D sequence of 0/1 labels, one per row of train_x.
        n_iterations: Number of full passes over the data.
        learning_rate: Step size applied to the summed gradient.
        show_progress: Wrap the iteration loop in a tqdm progress bar.

    Returns:
        A list of floats, one weight per column of train_x (θ_j), in
        column order.

    Raises:
        ValueError: if train_x is not 2-D or the number of labels differs
            from the number of rows.
    """
    features = np.asarray(train_x, dtype=float)
    labels = np.asarray(train_y, dtype=float)
    if features.ndim != 2:
        raise ValueError("train_x must be a 2-D table of features")
    if labels.shape != (features.shape[0],):
        raise ValueError("train_y must hold exactly one label per row of train_x")

    weights = np.zeros(features.shape[1])  # θ_j = 0 for all j

    steps = range(n_iterations)
    if show_progress:
        steps = tqdm(steps)

    for _ in steps:
        # σ(θ^T x) for every row at once. Written as
        #   1 / (1 + e^{-z})      for z >= 0
        #   e^{z} / (1 + e^{z})   for z <  0
        # so the exponent is never positive and exp cannot overflow.
        scores = features @ weights
        decay = np.exp(-np.abs(scores))
        predicted = np.where(scores >= 0, 1.0, decay) / (1.0 + decay)

        # gradient[j] = Σ_i x_ij * (y_i - σ(θ^T x_i))
        gradients = features.T @ (labels - predicted)

        # Step in the gradient-ascent direction.
        weights += learning_rate * gradients

    return weights.tolist()


def predict(x, weights):
    """Return σ(θ^T x): the modelled probability that x has label 1.

    x and weights are walked in lockstep; each feature is multiplied by
    its weight and the products are added up left to right before being
    passed through the sigmoid.
    """
    products = (feature * weight for feature, weight in zip(x, weights))

    # Running total of θ^T x, starting from 0.0.
    total = 0.0
    for term in products:
        total += term

    return sigmoid(total)


def sigmoid(z):
    """Return the logistic function σ(z) = 1 / (1 + e^{−z}).

    Accepts a scalar or a NumPy array. The value is computed as
        1 / (1 + e^{−|z|})          for z >= 0
        e^{−|z|} / (1 + e^{−|z|})   for z <  0
    which is algebraically the same function, but the exponent is never
    positive, so very negative inputs no longer overflow inside exp (the
    naive form raises a RuntimeWarning once z drops below about -709).
    """
    decay = exp(-abs(z))  # always in (0, 1]
    # np.where picks the numerator per element; dividing by the NumPy
    # float brings a 0-d result back to a plain NumPy scalar.
    return np.where(z >= 0, 1.0, decay) / (1.0 + decay)


# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()


# Output captured from the recorded run:
# FileNotFoundError: [Errno 2] No such file or directory: 'diabetic_data.csv'