LOGISTIC REGRESSION

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, roc_auc_score,
    roc_curve, f1_score
)

In [None]:
def train_logistic_regression(X_train, y_train):
    """Fit a logistic-regression classifier on the training split.

    The estimator uses the ``liblinear`` solver with at most 1000
    iterations, ``class_weight='balanced'`` and a fixed ``random_state``
    of 42 so repeated runs are reproducible.

    Args:
        X_train: Training feature matrix passed straight to ``fit``.
        y_train: Training labels passed straight to ``fit``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    hyperparams = {
        'max_iter': 1000,
        'solver': 'liblinear',
        # 'balanced' lets scikit-learn reweight classes by their frequency.
        'class_weight': 'balanced',
        'random_state': 42,
    }
    classifier = LogisticRegression(**hyperparams)
    classifier.fit(X_train, y_train)
    return classifier


def find_best_threshold(y_true, y_probs, metric=f1_score):
    """Return the decision threshold that maximises ``metric``.

    Scans 101 evenly spaced thresholds from 0 to 1 (inclusive). For each
    one, scores are binarised with ``score >= threshold`` and the result
    is evaluated with ``metric(y_true, predictions)``.

    Args:
        y_true: Ground-truth labels, forwarded unchanged to ``metric``.
        y_probs: Predicted scores; any array-like (list, tuple, ndarray,
            Series) is accepted.
        metric: Callable ``metric(y_true, y_pred)`` returning a number
            where larger is better. Defaults to ``f1_score``.

    Returns:
        The best-scoring threshold. On ties the lowest threshold wins,
        because ``np.argmax`` returns the first maximum.
    """
    # Coerce once so plain Python sequences support the vectorised
    # comparison below (a list would raise TypeError on ``>=``).
    probs = np.asarray(y_probs)
    thresholds = np.linspace(0, 1, 101)
    scores = [metric(y_true, (probs >= t).astype(int)) for t in thresholds]
    best_threshold = thresholds[np.argmax(scores)]

    # Keep the historical "F1" label for the default metric, but do not
    # mislabel the output when the caller supplies a different metric.
    if metric is f1_score:
        metric_label = "F1"
    else:
        metric_label = getattr(metric, "__name__", repr(metric))
    print(f"Best threshold ({metric_label}): {best_threshold:.2f}")
    return best_threshold


def evaluate_predictions(y_true, y_probs, threshold):
    """Score thresholded predictions and print a short evaluation summary.

    Args:
        y_true: Ground-truth labels.
        y_probs: Predicted scores; must support ``>=`` and ``.astype``
            (e.g. a NumPy array).
        threshold: Cut-off; scores greater than or equal to it become 1,
            all others 0.

    Returns:
        A tuple ``(preds, accuracy, auc, fpr, tpr, report_df)`` where
        ``fpr``/``tpr`` come from ``roc_curve`` and ``report_df`` is the
        transposed ``classification_report`` dictionary. The dictionary
        report is built without ``target_names``, unlike the printed one.
    """
    predicted = (y_probs >= threshold).astype(int)

    accuracy = accuracy_score(y_true, predicted)
    # AUC and the ROC curve use the raw scores, not the thresholded labels.
    auc = roc_auc_score(y_true, y_probs)
    # roc_curve also returns the thresholds it used; they are not needed here.
    fpr, tpr = roc_curve(y_true, y_probs)[:2]

    print(f"\n🔎 Test Accuracy (threshold={threshold:.2f}): {accuracy:.2f}")
    print("Classification Report:")
    print(classification_report(y_true, predicted, target_names=["Class 0", "Class 1"]))

    report_dict = classification_report(y_true, predicted, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose()

    print(f"\nAUC-ROC (Test): {auc:.2f}")

    return predicted, accuracy, auc, fpr, tpr, report_df


def get_feature_importance(model, feature_names, top_n=10):
    """Rank features by the absolute size of their model coefficient.

    Args:
        model: Fitted estimator exposing ``coef_``; only the first row
            (``coef_[0]``) is used.
        feature_names: Names aligned one-to-one with that coefficient row.
        top_n: How many of the highest-ranked features to print.

    Returns:
        A DataFrame with columns ``Feature``, ``Coefficient`` and
        ``Abs_Coefficient`` covering *all* features, sorted by
        ``Abs_Coefficient`` in descending order.
    """
    coefficients = model.coef_[0]
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Coefficient': coefficients,
        'Abs_Coefficient': np.abs(coefficients),
    }).sort_values(by='Abs_Coefficient', ascending=False)

    # The heading reports the requested count instead of a fixed "10",
    # so it stays truthful when callers override top_n.
    print(f"\n📊 Top {top_n} Most Important Features (Logistic Regression):")
    print(importance_df[['Feature', 'Coefficient']].head(top_n))

    return importance_df


def plot_roc_curve(fpr, tpr, auc, title="ROC Curve - Logistic Regression"):
    """Draw a ROC curve with a dashed diagonal reference line and show it.

    Args:
        fpr: False-positive rates (x-axis values).
        tpr: True-positive rates (y-axis values).
        auc: Area under the curve, shown in the legend with two decimals.
        title: Title placed above the plot.
    """
    fig, ax = plt.subplots(figsize=(6, 4))

    ax.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
    # Diagonal from (0, 0) to (1, 1) as a visual reference.
    ax.plot([0, 1], [0, 1], linestyle='--', color='gray')

    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title)
    ax.legend()
    ax.grid(True)

    fig.tight_layout()
    plt.show()


def run_logistic_regression_pipeline(X_train, y_train, X_val, y_val, X_test, y_test, feature_names):
    """Train, tune the threshold, evaluate and summarise a logistic regression.

    Steps:
        1. Fit the model on the training split.
        2. Choose the decision threshold from validation-set scores.
        3. Evaluate on the test split at that threshold and plot the ROC curve.
        4. Rank features by coefficient magnitude.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels (threshold selection).
        X_test, y_test: Test features and labels (final evaluation).
        feature_names: Names aligned with the model's coefficients.

    Returns:
        ``(model, results)`` where ``results`` is a dict with keys
        ``val_probs``, ``test_probs``, ``test_preds``, ``accuracy``, ``auc``,
        ``best_threshold``, ``fpr``, ``tpr``, ``report_df`` and
        ``feature_importance``.
    """
    classifier = train_logistic_regression(X_train, y_train)

    # Column 1 of predict_proba is used as the score throughout.
    val_scores = classifier.predict_proba(X_val)[:, 1]
    # The threshold is chosen on validation data only; test data is not used here.
    cutoff = find_best_threshold(y_val, val_scores)

    test_scores = classifier.predict_proba(X_test)[:, 1]
    evaluation = evaluate_predictions(y_test, test_scores, cutoff)
    test_labels, accuracy, auc, fpr, tpr, report_df = evaluation
    plot_roc_curve(fpr, tpr, auc)

    ranked_features = get_feature_importance(classifier, feature_names)

    summary = dict(
        val_probs=val_scores,
        test_probs=test_scores,
        test_preds=test_labels,
        accuracy=accuracy,
        auc=auc,
        best_threshold=cutoff,
        fpr=fpr,
        tpr=tpr,
        report_df=report_df,
        feature_importance=ranked_features,
    )

    return classifier, summary
