Rebekah Testing

WITHOUT CATEGORIES

In [None]:
# Build train/validation feature frames with the `cat_cols` columns removed.
# `drop` returns new frames, so X_train_proc / X_val_proc are left untouched.
# NOTE(review): `cat_cols` presumably lists the categorical columns — confirm where it is defined.
X_train_proc_wo_cat = X_train_proc.drop(labels=cat_cols, axis='columns')
X_val_proc_wo_cat = X_val_proc.drop(labels=cat_cols, axis='columns')

In [None]:
import numpy as np
import pandas as pd
import shap
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, average_precision_score, make_scorer
from sklearn.model_selection import cross_val_score

# SHAP-guided feature ablation on the feature set WITHOUT the `cat_cols` columns.
# For each logistic regression: fit on all remaining features, rank features by
# mean |SHAP| value, then refit after dropping the n least important features and
# record validation + cross-validated metrics for every n.

# Define models
log_regs = {
    'lr1': LogisticRegression(penalty='l2', solver='newton-cg', class_weight={0:1, 1:250}, random_state=42, C=0.10),
    'lr2': LogisticRegression(penalty=None, solver='lbfgs', class_weight={0:1, 1:400}, random_state=42),
}

# Number of lowest-|SHAP| features to drop in each ablation run
n_values = [10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]

# Cross-validation config
kfold = PredefinedKFold(split_data)
recall_macro_scorer = make_scorer(recall_score, average='macro')

# To store results (one dict per fitted model)
results = []

# Loop over models
for model_name, base_model in log_regs.items():

    # Fit the baseline in place on the full feature set; the fitted estimator
    # stays available through `log_regs` and drives the SHAP ranking below.
    base_model.fit(X_train_proc_wo_cat, y_train)

    # Rank features by mean absolute SHAP value of the baseline model
    explainer = shap.Explainer(base_model, X_train_proc_wo_cat)
    shap_values = explainer(X_train_proc_wo_cat)
    feature_importance = np.abs(shap_values.values).mean(axis=0)
    importance_df = pd.DataFrame({
        'feature': X_train_proc_wo_cat.columns,
        'importance': feature_importance
    }).sort_values(by='importance', ascending=True)  # Least important first

    n_features = len(importance_df)

    # n = 0 is the full-feature baseline; n > 0 drops the n least important features
    for n in [0] + n_values:
        if n >= n_features:
            # Dropping every column would leave nothing to fit on.
            print(f"Skipping n={n} for {model_name}: only {n_features} features available")
            continue

        low_shap = importance_df['feature'].head(n).tolist()
        X_train_reduced = X_train_proc_wo_cat.drop(columns=low_shap)
        X_val_reduced = X_val_proc_wo_cat.drop(columns=low_shap)

        if n == 0:
            # Reuse the already-fitted baseline instead of refitting it
            model = base_model
        else:
            # Fresh, unfitted copy with identical hyperparameters
            model = clone(base_model)
            model.fit(X_train_reduced, y_train)

        y_pred = model.predict(X_val_reduced)
        # PR-AUC needs continuous scores, not thresholded labels:
        # use the predicted probability of the positive class (label 1).
        y_score = model.predict_proba(X_val_reduced)[:, 1]

        # Cross-validated macro recall (cross_val_score clones the estimator per fold)
        cv_scores = cross_val_score(model, X_train_reduced, y_train, cv=kfold, scoring=recall_macro_scorer)

        results.append({
            'model': model_name,
            'reduced': n > 0,
            'n_dropped': n,
            'parameters': model.get_params(),
            'accuracy': accuracy_score(y_val, y_pred),
            'macro_recall': recall_score(y_val, y_pred, average='macro'),
            'pr_auc': average_precision_score(y_val, y_score),
            'confusion_matrix': confusion_matrix(y_val, y_pred).tolist(),
            'cv_macro_recall_mean': cv_scores.mean(),
            'cv_macro_recall_per_fold': cv_scores.tolist()
        })

# Convert to DataFrame
results_wo_cat_df = pd.DataFrame(results)


In [None]:
# Show the ablation results for the models trained on X_train_proc_wo_cat
# (one row per model / number of dropped features).
results_wo_cat_df

In [None]:
# WITH CATEGORIES
# SHAP-guided feature ablation on the full processed feature set (X_train_proc).
# For each logistic regression: fit on all features, rank features by mean |SHAP|
# value, then refit after dropping the n least important features and record
# validation + cross-validated metrics for every n.
# NOTE(review): this cell mirrors the "WITHOUT CATEGORIES" pipeline earlier in the
# notebook; consider factoring both into one shared function.

# Define models
log_regs = {
    'lr3': LogisticRegression(penalty='l1', solver='liblinear', class_weight={0:1, 1:250}, random_state=42, C=1),
    'lr4': LogisticRegression(penalty='l2', solver='lbfgs', class_weight={0:1, 1:400}, random_state=42, C=1)
}

# Number of lowest-|SHAP| features to drop in each ablation run
n_values = [10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]

# Cross-validation config
kfold = PredefinedKFold(split_data)
recall_macro_scorer = make_scorer(recall_score, average='macro')

# To store results (one dict per fitted model)
results = []

# Loop over models
for model_name, base_model in log_regs.items():

    # Fit the baseline in place on the full feature set; the fitted estimator
    # stays available through `log_regs` and drives the SHAP ranking below.
    base_model.fit(X_train_proc, y_train)

    # Rank features by mean absolute SHAP value of the baseline model
    explainer = shap.Explainer(base_model, X_train_proc)
    shap_values = explainer(X_train_proc)
    feature_importance = np.abs(shap_values.values).mean(axis=0)
    importance_df = pd.DataFrame({
        'feature': X_train_proc.columns,
        'importance': feature_importance
    }).sort_values(by='importance', ascending=True)  # Least important first

    n_features = len(importance_df)

    # n = 0 is the full-feature baseline; n > 0 drops the n least important features
    for n in [0] + n_values:
        if n >= n_features:
            # Dropping every column would leave nothing to fit on.
            print(f"Skipping n={n} for {model_name}: only {n_features} features available")
            continue

        low_shap = importance_df['feature'].head(n).tolist()
        X_train_reduced = X_train_proc.drop(columns=low_shap)
        X_val_reduced = X_val_proc.drop(columns=low_shap)

        if n == 0:
            # Reuse the already-fitted baseline instead of refitting it
            model = base_model
        else:
            # Fresh, unfitted copy with identical hyperparameters
            model = clone(base_model)
            model.fit(X_train_reduced, y_train)

        y_pred = model.predict(X_val_reduced)
        # PR-AUC needs continuous scores, not thresholded labels:
        # use the predicted probability of the positive class (label 1).
        y_score = model.predict_proba(X_val_reduced)[:, 1]

        # Cross-validated macro recall (cross_val_score clones the estimator per fold)
        cv_scores = cross_val_score(model, X_train_reduced, y_train, cv=kfold, scoring=recall_macro_scorer)

        results.append({
            'model': model_name,
            'reduced': n > 0,
            'n_dropped': n,
            'parameters': model.get_params(),
            'accuracy': accuracy_score(y_val, y_pred),
            'macro_recall': recall_score(y_val, y_pred, average='macro'),
            'pr_auc': average_precision_score(y_val, y_score),
            'confusion_matrix': confusion_matrix(y_val, y_pred).tolist(),
            'cv_macro_recall_mean': cv_scores.mean(),
            'cv_macro_recall_per_fold': cv_scores.tolist()
        })

# Convert to DataFrame
results_w_cat_df = pd.DataFrame(results)


      


In [None]:
# Show the ablation results for lr3/lr4 trained on the full X_train_proc
# feature set (one row per model / number of dropped features).
results_w_cat_df