Looking at Logistic Regression Metrics.

### Imports

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [2]:
# Load dataset
FILEPATH = '../../../../Datasets/Binary_predictors.csv'
data = pd.read_csv(FILEPATH)

# Encode categorical variables as 0/1 indicators.
# Series.map yields NaN for any value that is not a key of the mapping,
# so unexpected labels in the CSV would surface as missing values here.
data['Admitted'] = data['Admitted'].map({'Yes': 1, 'No': 0})
data['Gender'] = data['Gender'].map({'Female': 1, 'Male': 0})

# Split dataset into training and testing sets:
# a seeded 80% sample for training, the remaining rows for testing.
train_data = data.sample(frac=0.8, random_state=2000)
test_data = data.drop(train_data.index)

# Define target variable and predictors.
# statsmodels does not include an intercept automatically, so a constant
# column is added with sm.add_constant(). has_constant='add' forces the
# 'const' column to be created even if a predictor happens to be constant
# within one split; the default ('skip') would silently omit it and leave
# X_train and X_test with different columns.
X_train = sm.add_constant(train_data[['SAT', 'Gender']], has_constant='add')
X_test = sm.add_constant(test_data[['SAT', 'Gender']], has_constant='add')

y_train = train_data['Admitted']
y_test = test_data['Admitted']

### Statsmodels metrics

In [None]:
# Build the logit model (binary target vs. design matrix with intercept)
# and estimate its parameters.
model = sm.Logit(endog=y_train, exog=X_train)
results = model.fit()

Optimization terminated successfully.
         Current function value: 0.093311
         Iterations 11
                           Logit Regression Results                           
Dep. Variable:               Admitted   No. Observations:                  134
Model:                          Logit   Df Residuals:                      131
Method:                           MLE   Df Model:                            2
Date:                Mon, 17 Mar 2025   Pseudo R-squ.:                  0.8636
Time:                        15:02:16   Log-Likelihood:                -12.504
converged:                       True   LL-Null:                       -91.669
Covariance Type:            nonrobust   LLR p-value:                 4.158e-35
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -91.2887     29.907     -3.052      0.002    -149.905     -32.672
SAT            0.0543      0

In [None]:
# Display the regression summary table of the fitted model
# (coefficients, standard errors, z-statistics, p-values, confidence
# intervals, and overall fit statistics such as the pseudo R-squared).
print(results.summary())

In [None]:
# Statsmodels metrics
# Exponentiate the fitted coefficients for interpretation: exp(coef) is the
# multiplicative change in the odds of admission for a one-unit increase in
# the corresponding predictor (the exponentiated const is the baseline odds).
exp_coef = np.exp(results.params)
print("Exponentiated Coefficients:", exp_coef)
# Compute training accuracy
def compute_accuracy(actual, predicted, threshold=0.5):
    """
    Build a 2x2 confusion matrix and the accuracy from predicted probabilities.

    Parameters
    ----------
    actual : array-like
        True class labels, coded as 0 or 1.
    predicted : array-like
        Predicted probabilities of class 1, in the interval [0, 1].
    threshold : float, default 0.5
        Probability cut-off (0 < threshold < 1). Probabilities greater than
        or equal to the threshold are counted as class 1.

    Returns
    -------
    cm : numpy.ndarray
        2x2 array of counts (floats); rows are the actual class and columns
        are the predicted class.
    accuracy : float
        Share of observations on the diagonal of ``cm``. The result is NaN
        when no observation falls inside the bins (e.g. empty input).
    """
    # Labels are always split at 0.5; only the probability axis uses the
    # caller-supplied threshold. The last bin is closed on the right, so
    # labels/probabilities equal to exactly 1 are counted.
    label_edges = np.array([0, 0.5, 1])
    prob_edges = np.array([0, threshold, 1])
    cm = np.histogram2d(actual, predicted, bins=[label_edges, prob_edges])[0]
    accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()
    return cm, accuracy

# In-sample predicted probabilities, scored against the training labels
train_predictions = results.predict()
cm_train, accuracy_train = compute_accuracy(
    actual=y_train,
    predicted=train_predictions,
)
print(f'Training Accuracy: {accuracy_train:.2f}')
# Score the held-out rows and tabulate the predictions against the true labels
test_predictions = results.predict(exog=X_test)
cm_test, accuracy_test = compute_accuracy(
    actual=y_test,
    predicted=test_predictions,
)

# Wrap the raw count matrix in a labelled DataFrame
# (rows = actual class, columns = predicted class)
cm_df = pd.DataFrame(
    cm_test,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
print("Confusion Matrix:")
print(cm_df)
# Report the share of correct test predictions and its complement
print(f'Test Accuracy: {accuracy_test:.2f}')
print(f'Misclassification Rate: {1 - accuracy_test:.2f}')

In [5]:
# Exponentiate the fitted coefficients for interpretation: exp(coef) is the
# multiplicative change in the odds of admission for a one-unit increase in
# the corresponding predictor (the exponentiated const is the baseline odds).
exp_coef = np.exp(results.params)
print("Exponentiated Coefficients:", exp_coef)

Exponentiated Coefficients: const     2.258522e-40
SAT       1.055779e+00
Gender    8.773478e+00
dtype: float64


In [6]:
# Compute training accuracy
def compute_accuracy(actual, predicted, threshold=0.5):
    """
    Build a 2x2 confusion matrix and the accuracy from predicted probabilities.

    Parameters
    ----------
    actual : array-like
        True class labels, coded as 0 or 1.
    predicted : array-like
        Predicted probabilities of class 1, in the interval [0, 1].
    threshold : float, default 0.5
        Probability cut-off (0 < threshold < 1). Probabilities greater than
        or equal to the threshold are counted as class 1.

    Returns
    -------
    cm : numpy.ndarray
        2x2 array of counts (floats); rows are the actual class and columns
        are the predicted class.
    accuracy : float
        Share of observations on the diagonal of ``cm``. The result is NaN
        when no observation falls inside the bins (e.g. empty input).
    """
    # Labels are always split at 0.5; only the probability axis uses the
    # caller-supplied threshold. The last bin is closed on the right, so
    # labels/probabilities equal to exactly 1 are counted.
    label_edges = np.array([0, 0.5, 1])
    prob_edges = np.array([0, threshold, 1])
    cm = np.histogram2d(actual, predicted, bins=[label_edges, prob_edges])[0]
    accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()
    return cm, accuracy

# In-sample predicted probabilities, scored against the training labels
train_predictions = results.predict()
cm_train, accuracy_train = compute_accuracy(
    actual=y_train,
    predicted=train_predictions,
)
print(f'Training Accuracy: {accuracy_train:.2f}')

Training Accuracy: 0.96


In [8]:
# Score the held-out rows and tabulate the predictions against the true labels
test_predictions = results.predict(exog=X_test)
cm_test, accuracy_test = compute_accuracy(
    actual=y_test,
    predicted=test_predictions,
)

# Wrap the raw count matrix in a labelled DataFrame
# (rows = actual class, columns = predicted class)
cm_df = pd.DataFrame(
    cm_test,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
print("Confusion Matrix:")
print(cm_df)

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         14.0          2.0
Actual 1          2.0         16.0


In [9]:
# Display test accuracy and misclassification rate.
# The misclassification rate is the complement of accuracy (1 - accuracy);
# both values are rounded to two decimals for display.
print(f'Test Accuracy: {accuracy_test:.2f}')
print(f'Misclassification Rate: {1 - accuracy_test:.2f}')

Test Accuracy: 0.88
Misclassification Rate: 0.12


### Sklearn model metrics

Will the metrics calculated by statsmodels and sklearn be the same or similar?

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [11]:
# Sklearn model fitting
# LogisticRegression fits its own intercept (fit_intercept=True by default),
# so the manually added 'const' column is left out of the feature matrix;
# passing it would only add a redundant, penalized constant feature.
# Note: unlike the statsmodels fit, sklearn applies L2 regularization by
# default (C=1.0), so coefficients need not match exactly.
model = LogisticRegression()
model.fit(X_train[['SAT', 'Gender']], y_train)

# Predictions
y_pred_prob = model.predict_proba(X_test[['SAT', 'Gender']])[:, 1]  # Probabilities for class 1
y_pred = (y_pred_prob >= 0.5).astype(int)  # Apply threshold to get predicted labels (0 or 1)

# Compute confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix (Sklearn):")
print(cm)

# Compute accuracy manually, using the same formula as the statsmodels section
accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()
print(f'Accuracy (Sklearn): {accuracy:.2f}')

# Misclassification rate
misclassification_rate = 1 - accuracy
print(f'Misclassification Rate (Sklearn): {misclassification_rate:.2f}')

Confusion Matrix (Sklearn):
[[14  2]
 [ 2 16]]
Accuracy (Sklearn): 0.88
Misclassification Rate (Sklearn): 0.12
