In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
# %config InlineBackend.figure_format = 'svg' 
# plt.style.use('ggplot')

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Output folder for saved figures. Built with os.path.join so the separator is
# right on every OS; the previous literal '.\projects_temp-figs' relied on the
# invalid escape sequence '\p' and only behaved as a path on Windows.
SAVE_PATH = os.path.join('.', 'projects_temp-figs')

# Generate a synthetic binary classification dataset
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a logistic regression classifier
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predict probabilities on the test set
y_pred = clf.predict_proba(X_test)[:, 1]  # probabilities for the positive class

# y_test and y_pred are the inputs for the ROC plotting functions in this notebook;
# show only the first few entries instead of dumping both full arrays.
y_test[:5], y_pred[:5]


In [None]:
def _roc_points(y_true, y_scores):
    """
    Computes the ROC operating points (FPR, TPR) of a binary classifier.

    Samples are ranked by decreasing score and one operating point is emitted
    per *distinct* score value, so samples with tied scores are always moved
    across the threshold together. The origin (0, 0) is prepended so the curve
    starts at the "predict nothing positive" corner.

    Parameters:
    y_true (array-like): True binary labels (0 = negative, 1 = positive).
    y_scores (array-like): Predicted scores/probabilities for the positive class.

    Returns:
    tuple: `(fpr, tpr)` as 1-D float arrays of equal length, starting at (0, 0).

    Raises:
    ValueError: If the inputs are empty or differ in length.
    """
    import warnings

    # np.asarray + ravel accepts lists, pandas Series and (n, 1) column vectors
    labels = np.asarray(y_true, dtype=float).ravel()
    scores = np.asarray(y_scores, dtype=float).ravel()

    if labels.size != scores.size:
        raise ValueError(
            f"y_true and y_scores must have the same length "
            f"(got {labels.size} and {scores.size})"
        )
    if labels.size == 0:
        raise ValueError("y_true and y_scores must not be empty")

    # Rank samples from the highest to the lowest score
    order = np.argsort(scores)[::-1]
    scores = scores[order]
    labels = labels[order]

    # Index of the last sample in every run of equal scores: only there does the
    # threshold actually change, so only there is an operating point defined.
    last_of_run = np.r_[np.flatnonzero(np.diff(scores)), scores.size - 1]

    # Cumulative true/false positive counts evaluated at each distinct threshold
    tp = np.cumsum(labels)[last_of_run]
    fp = np.cumsum(1.0 - labels)[last_of_run]
    n_pos = tp[-1]
    n_neg = fp[-1]

    if n_pos == 0 or n_neg == 0:
        warnings.warn(
            "Only one class present in y_true; the ROC curve is undefined and "
            "the rate of the missing class is set to 0.",
            RuntimeWarning,
        )

    # Guarded division keeps NumPy's global error state untouched; a missing
    # class yields an all-zero rate.
    tpr = tp / n_pos if n_pos > 0 else np.zeros_like(tp)
    fpr = fp / n_neg if n_neg > 0 else np.zeros_like(fp)

    return np.r_[0.0, fpr], np.r_[0.0, tpr]


def my_plot_roc_binary(y_true, y_scores, save_path, figure_name):
    """
    Plots the Receiver Operating Characteristic (ROC) curve, saves it as a PNG
    and calculates the Area Under the Curve (AUC) for a binary classifier.

    The True Positive Rate (TPR) and False Positive Rate (FPR) are computed at
    every distinct value of the predicted scores (tied scores share a single
    threshold). The ROC curve is plotted with TPR against FPR, starting at the
    origin, and the AUC is calculated using the trapezoidal rule.

    Parameters:
    y_true (array-like): True binary labels. Must be the same length as `y_scores`.
    y_scores (array-like): Predicted scores/probabilities for the positive class.
    save_path (str): Directory the figure is written to; created if missing.
    figure_name (str): File name of the saved PNG inside `save_path`.

    Returns:
    float: The AUC (Area Under Curve) value, a measure of the classifier's performance.

    Raises:
    ValueError: If `y_true` and `y_scores` are empty or differ in length.

    Note:
    This function assumes that the positive class is labeled as '1' and the negative
    class as '0'. The `y_scores` array should contain probabilities or scores that
    reflect the likelihood of belonging to the positive class. If only one class
    is present a RuntimeWarning is issued and the rate of the missing class is 0.
    """
    # Compute the curve first so invalid input fails before anything is written
    fpr, tpr = _roc_points(y_true, y_scores)

    # Trapezoidal rule written out explicitly; this avoids np.trapz, which
    # NumPy 2.0 deprecated in favour of np.trapezoid.
    roc_auc = float(np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0))

    # Define the figure name and path
    my_save_path = os.path.join(save_path, figure_name)

    # Ensure the save directory exists
    os.makedirs(save_path, exist_ok=True)

    # Plotting ROC Curve
    plt.figure()
    plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
    plt.plot([0, 1], [0, 1], color='darkgray', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    # Tick sizes match the sklearn-based plot so both figures compare side by side
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlabel('False Positive Rate', fontsize=14)
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.title('Receiver Operating Characteristic (ROC)', fontsize=16)
    plt.legend(loc="lower right", fontsize=14)

    plt.savefig(my_save_path, format='png', bbox_inches='tight')
    print(f"Plot saved to {save_path} as {figure_name}")
    plt.show()

    return roc_auc


In [None]:
# Draw the hand-rolled ROC curve for the held-out split and keep its AUC
my_auc = my_plot_roc_binary(
    y_true=y_test,
    y_scores=y_pred,
    save_path=SAVE_PATH,
    figure_name='my_plot_roc_binary.png',
)

In [None]:
# Display the AUC returned by my_plot_roc_binary
my_auc

In [None]:
def plot_roc_skl_binary(y_true, y_scores, save_path, figure_name):
    """
    Draws a binary classifier's ROC curve with scikit-learn, saves the figure
    as a PNG and returns the Area Under the Curve (AUC).

    The curve coordinates (False Positive Rate vs. True Positive Rate) come from
    sklearn's `roc_curve`, and the AUC from sklearn's `roc_auc_score`. The figure
    is written to `save_path/figure_name`, a confirmation line is printed, and
    the plot is shown.

    Parameters:
    y_true (array-like): True binary labels. Must be the same length as `y_scores`.
    y_scores (array-like): Predicted probabilities or decision function scores for
                           the positive class. Must be a 1D array.
    save_path (str): Directory the figure is written to; created if it does not exist.
    figure_name (str): File name of the saved PNG inside `save_path`.

    Returns:
    float: The AUC (Area Under Curve) value as computed by `roc_auc_score`.

    Note:
    The function assumes a binary classification task where the positive class is labeled as '1'
    and the negative class as '0'. The `y_scores` should contain values that represent the
    likelihood of the positive class, typically ranging between 0 and 1.
    """
    # Full path of the output image, and the folder it lives in
    target_file = os.path.join(save_path, figure_name)
    os.makedirs(save_path, exist_ok=True)

    # Curve coordinates from sklearn; the threshold values are not used here
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_scores)

    # Area under the curve, also from sklearn
    auc_value = roc_auc_score(y_true, y_scores)
    curve_label = f'ROC curve (AUC = {auc_value:.4f})'

    # Curve plus the chance-level diagonal
    plt.figure()
    plt.plot(false_pos_rate, true_pos_rate, color='blue', lw=2, label=curve_label)
    plt.plot([0, 1], [0, 1], color='darkgray', lw=2, linestyle='--')

    # Axes: limits, tick sizes, labels, title and legend
    plt.xlim(0.0, 1.0)
    plt.ylim(0.0, 1.05)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlabel('False Positive Rate', fontsize=14)
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.title('Receiver Operating Characteristic (ROC)', fontsize=16)
    plt.legend(loc="lower right", fontsize=14)

    # Persist first, then display
    plt.savefig(target_file, format='png', bbox_inches='tight')
    print(f"Plot saved to {save_path} as {figure_name}")
    plt.show()

    return auc_value


In [None]:
# Draw the sklearn-based ROC curve for the same held-out split and keep its AUC
auc_skl = plot_roc_skl_binary(
    y_true=y_test,
    y_scores=y_pred,
    save_path=SAVE_PATH,
    figure_name='my_plot_roc_sk.png',
)

In [None]:
# Display the AUC returned by plot_roc_skl_binary
auc_skl

In [None]:
# Load the results CSV into a pandas DataFrame
dataset_out = 'med_records_results.csv'
folder_out = './projects_outscope'
file_path_out = os.path.join(folder_out, dataset_out)

data_frame = pd.read_csv(file_path_out)

# Extract the label and score columns as 1-D arrays. Selecting with a single
# column name (rather than a one-element list) gives shape (n,) instead of
# (n, 1), which is what the ROC plotting functions document as their input.
y_test_out = data_frame['y_test'].to_numpy()
y_pred_out = data_frame['y_pred'].to_numpy()

# Display the first few entries of each array for confirmation
y_test_out[:5], y_pred_out[:5]

In [None]:
# Plot the ROC curve for the labels/scores loaded from the CSV and save the
# figure into the same folder the CSV was read from
figure_name_out = 'my_roc_out.png'
my_auc_out = plot_roc_skl_binary(
    y_true=y_test_out,
    y_scores=y_pred_out,
    save_path=folder_out,
    figure_name=figure_name_out,
)