In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    roc_curve,
    auc,
)
import shap
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import yaml
from ipywidgets import interact, widgets, VBox
from IPython.display import display, update_display
from typing import List, Tuple, Dict, Any

In [2]:
import os
from pathlib import Path

# Go up one directory level from the notebook's location
project_root = Path().resolve().parent  # Navigate to the parent directory
os.chdir(project_root)  # Set this as the working directory

# print("Current working directory set to:", os.getcwd())

In [3]:
with open("config/paths.yaml", "r") as file:
    paths = yaml.load(file, Loader=yaml.FullLoader)

# Load the data
X = pd.read_csv(paths["train"]["final_features"])
y = pd.read_csv(paths["train"]["labels"])
X_val = pd.read_csv(paths["val"]["final_features"])
y_val = pd.read_csv(paths["val"]["labels"])
# test_X = pd.read_csv(paths["test"]["final_features"])

clf_mask = pd.read_csv(paths["clf_mask_file"], index_col="label")
regr_mask = pd.read_csv(paths["regr_mask_file"], index_col="label")

# Path to stored classification models
svc_models_path =  os.path.normpath(os.path.join(paths["models"]["clf"], "SVC"))
rf_models_path = os.path.normpath(
    os.path.join(paths["models"]["clf"], "RandomForestClassifier")
)

# Path to stored regression models
kr_models_path =  os.path.normpath(os.path.join(paths["models"]["regr"], "KernelRidge"))

print(f"SVC: {svc_models_path}")
print(f"RandomForest: {rf_models_path}")
print(f"KernelRidge: {kr_models_path}")

SVC: models\classification\SVC
RandomForest: models\classification\RandomForestClassifier
KernelRidge: models\regression\KernelRidge


In [4]:
def load_models(dir: str) -> Dict:
    models = {}
    for file in os.listdir(dir):
        if file.endswith(".joblib"):
            model = joblib.load(os.path.join(dir, file))
            label = file.split(".")[0]
            models[label] = model
    return models

svc_models = load_models(svc_models_path)
rf_models = load_models(rf_models_path)
kr_models = load_models(kr_models_path)

print(f"SVC models: {len(svc_models)}")
print(f"RandomForest models: {len(rf_models)}")
print(f"KernelRidge models: {len(kr_models)}")

SVC models: 11
RandomForest models: 11
KernelRidge models: 4


## **Exploring Data Distribution**

Understanding the distribution of data is a critical step in analyzing and preparing it for machine learning tasks. Here, we focus on two key aspects:

### **1. Class Distribution**
- Visualizing the distribution of classes helps identify imbalances, which can impact model performance.
- Balanced datasets are ideal, but if imbalances exist, techniques like oversampling, undersampling, or class weighting might be needed.

### **2. Feature Distribution**
- Understanding how individual features are distributed within each class can provide insights into feature relevance and separability.
- Distribution analysis helps detect outliers, skewness, or patterns that might influence model training.

In [5]:
clf_labels = y.columns[1:12]

def plot_class_distribution(label_name):
    sns.countplot(
        x=y[label_name], palette="Set2", hue=y[label_name], dodge=False, legend=False
    )
    plt.title(f"Class Distribution for {label_name}")
    plt.xlabel("Class")
    plt.ylabel("Count")
    plt.show()


interact(plot_class_distribution, label_name=clf_labels)

interactive(children=(Dropdown(description='label_name', options=('LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LAB…

<function __main__.plot_class_distribution(label_name)>

In [6]:
"""
Plots the feature importance for a given label for the RandomForestClassifier model.
"""

def plot_feature_importance(label_name):
    model = rf_models[label_name]
    label_mask = clf_mask.loc[label_name]
    feature_names = label_mask[label_mask == True].index
    importances = model.feature_importances_  # For tree-based models
    indices = np.argsort(importances)[::-1]

    plt.figure(figsize=(10, 8))
    sns.barplot(
        x=importances[indices],
        y=np.array(feature_names)[indices],
        palette="viridis",
        hue=np.array(feature_names)[indices],
    )
    plt.title(f"Feature Importance for {label_name}")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.show()


interact(plot_feature_importance, label_name=clf_labels)

interactive(children=(Dropdown(description='label_name', options=('LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LAB…

<function __main__.plot_feature_importance(label_name)>

In [13]:
def plot_shap_summary(label_name):
    print(f"Performing SHAP analysis for {label_name}")

    model = rf_models[label_name]
    label_mask = clf_mask.loc[label_name]
    X_val_selected = X_val.drop(columns=["pid"])
    val_data = X_val_selected.loc[:, label_mask]

    explainer = shap.TreeExplainer(model)  # For tree-based models
    shap_values = explainer.shap_values(val_data)[:,:,1]  # Get the SHAP values for the positive class

    shap.plots.violin(shap_values, feature_names=list(val_data.columns), max_display=10, show=False)
    plt.title(f"SHAP Feature Importance for {label_name}", fontsize=16, pad=30)
    plt.show()

interact(plot_shap_summary, label_name=clf_labels)

interactive(children=(Dropdown(description='label_name', options=('LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LAB…

<function __main__.plot_shap_summary(label_name)>

In [8]:
from src.helper import standardize_data


def predict(X_train, X_val, feature_mask, model, prob: bool = False):
    X_train_selected = X_train.loc[:, feature_mask]
    X_val_selected = X_val.loc[:, feature_mask]

    norm_X_train, norm_X_val = standardize_data(X_train_selected, X_val_selected)

    if prob:
        y_pred = model.predict_proba(norm_X_val)[:, 1] # Probability of positive class
    else:
        y_pred = model.predict(norm_X_val)

    return y_pred

In [9]:
def compute_clf_metrics(
    X: pd.DataFrame,
    X_val: pd.DataFrame,
    y_val: pd.DataFrame,
    mask: pd.DataFrame,
    models: Dict,
    out_file: str,
) -> Dict:

    metrics = {}

    X = X.drop(columns=["pid"])
    X_val = X_val.drop(columns=["pid"])

    for label, model in models.items():
        y_true = y_val[label]
        label_mask = mask.loc[label, :]
        y_pred = predict(X, X_val, label_mask, model)
        y_prob = predict(X, X_val, label_mask, model, prob=True)

        # Compute metrics
        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred, zero_division=0)
        rec = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)

        fpr, tpr, _ = roc_curve(y_true, y_prob)
        auc_score = auc(fpr, tpr)

        # Compute confusion matrix
        cm = confusion_matrix(y_true, y_pred)

        metrics[label] = {
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1 Score": f1,
            "Confusion": cm,
            "ROC": (fpr, tpr),
            "AUC": auc_score,
        }

    # Convert metrics to a DataFrame for summary
    metrics_df = pd.DataFrame(metrics)
    metrics_df.to_csv(out_file)

    return metrics

In [10]:
svc_metrics_file = paths["evaluation"]["svc"]
rf_metrics_file = paths["evaluation"]["random_forest"]

svc_metrics = compute_clf_metrics(X, X_val, y_val, clf_mask, svc_models, svc_metrics_file)
rf_metrics = compute_clf_metrics(X, X_val, y_val, clf_mask, rf_models, rf_metrics_file)

print(f"SVC: {svc_metrics}")
print(f"RandomForest: {rf_metrics}")

SVC: {'LABEL_Alkalinephos': {'Accuracy': 0.6751776783364043, 'Precision': np.float64(0.39011703511053314), 'Recall': np.float64(0.6696428571428571), 'F1 Score': np.float64(0.4930156121610518), 'Confusion': array([[1965,  938],
       [ 296,  600]]), 'ROC': (array([0.00000000e+00, 3.44471237e-04, 3.44471237e-04, ...,
       9.96899759e-01, 9.96899759e-01, 1.00000000e+00]), array([0.        , 0.        , 0.00223214, ..., 0.99888393, 1.        ,
       1.        ])), 'AUC': np.float64(0.7280676393878254)}, 'LABEL_AST': {'Accuracy': 0.6607001842590156, 'Precision': np.float64(0.38504326328800986), 'Recall': np.float64(0.6793893129770993), 'F1 Score': np.float64(0.4915187376725838), 'Confusion': array([[1887,  995],
       [ 294,  623]]), 'ROC': (array([0.00000000e+00, 3.46981263e-04, 3.46981263e-04, ...,
       9.98265094e-01, 9.98265094e-01, 1.00000000e+00]), array([0.        , 0.        , 0.00218103, ..., 0.99890949, 1.        ,
       1.        ])), 'AUC': np.float64(0.7227381324461915)

In [11]:
# Interactive plot function
def plot_metrics(label_name):

    label_metrics = rf_metrics[label_name]
    confusion_matrix = label_metrics["Confusion"]

    # Create a plot with two sections: heatmap and text
    fig, ax = plt.subplots(2, 2, figsize=(12, 6), gridspec_kw={"width_ratios": [1, 1]})

    # Plot metrics for Random Forest
    sns.heatmap(
        confusion_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=["False", "True"],
        yticklabels=["False", "True"],
        ax=ax[0,0],
    )
    ax[0,0].set_title(f"Confusion Matrix")
    ax[0,0].set_xlabel("Predicted")
    ax[0,0].set_ylabel("True")

    # Extract other metrics (accuracy, precision, recall, F1-score)
    accuracy = label_metrics["Accuracy"]
    precision = label_metrics["Precision"]
    recall = label_metrics["Recall"]
    f1 = label_metrics["F1 Score"]

    # Prepare metrics text
    metrics_text = (
        f"Accuracy:  {accuracy:.2f}\n"
        f"Precision: {precision:.2f}\n"
        f"Recall:    {recall:.2f}\n"
        f"F1 Score:  {f1:.2f}"
    )

    # Add metrics text to the second subplot
    ax[0,1].axis("off")  # Turn off axis for text display
    ax[0,1].text(
        0.5,
        0.5,
        metrics_text,
        fontsize=14,
        ha="center",
        va="center",
        bbox=dict(boxstyle="round", facecolor="white", edgecolor="black"),
    )

    # Plot metrics for SVC
    label_metrics = svc_metrics[label_name]
    confusion_matrix = label_metrics["Confusion"]

    sns.heatmap(
        confusion_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=["False", "True"],
        yticklabels=["False", "True"],
        ax=ax[1,0],
    )
    ax[1,0].set_title(f"Confusion Matrix")
    ax[1,0].set_xlabel("Predicted")
    ax[1,0].set_ylabel("True")

    # Extract other metrics (accuracy, precision, recall, F1-score)
    accuracy = label_metrics["Accuracy"]
    precision = label_metrics["Precision"]
    recall = label_metrics["Recall"]
    f1 = label_metrics["F1 Score"]

    # Prepare metrics text
    metrics_text = (
        f"Accuracy:  {accuracy:.2f}\n"
        f"Precision: {precision:.2f}\n"
        f"Recall:    {recall:.2f}\n"
        f"F1 Score:  {f1:.2f}"
    )

    # Add metrics text to the second subplot
    ax[1,1].axis("off")  # Turn off axis for text display
    ax[1,1].text(
        0.5,
        0.5,
        metrics_text,
        fontsize=14,
        ha="center",
        va="center",
        bbox=dict(boxstyle="round", facecolor="white", edgecolor="black"),
    )

    # Add row labels for "Random Forest" and "SVC"
    ax[0, 0].text(
        -0.5,
        1.25,
        "Random Forest",
        transform=ax[0, 0].transAxes,
        fontsize=14,
        fontweight="bold",
    )
    ax[1, 0].text(
        -0.5, 1.25, "SVC", transform=ax[1, 0].transAxes, fontsize=14, fontweight="bold"
    )

    fig.suptitle(f"Classification metrics comparison of {label_name}")

    # Display the plot
    plt.tight_layout()
    plt.show()


# Create interactive selector at the bottom
label_selector = widgets.Dropdown(
    options=list(svc_metrics.keys()),
    description="Label:",
    style={"description_width": "initial"},
)
out = widgets.interactive_output(plot_metrics, {"label_name": label_selector})
ui = VBox([out, label_selector])
display(ui)

VBox(children=(Output(), Dropdown(description='Label:', options=('LABEL_Alkalinephos', 'LABEL_AST', 'LABEL_Bas…

In [12]:
def plot_roc_curve(label_name):
    # Create a plot with two sections: ROC curve and AUC score
    fig, ax = plt.subplots(1, 2, figsize=(12, 6), gridspec_kw={"width_ratios": [1, 1]})

    # Plot ROC curve for Random Forest and SVC

    # Random Forest
    label_metrics = rf_metrics[label_name]
    fpr, tpr = label_metrics["ROC"]
    ax[0].plot(fpr, tpr, label="Random Forest", color="blue")
    ax[0].fill_between(fpr, tpr, color="blue", alpha=0.2)

    # SVC
    label_metrics = svc_metrics[label_name]
    fpr, tpr = label_metrics["ROC"]
    ax[0].plot(fpr, tpr, label="SVC", color="orange")
    ax[0].fill_between(fpr, tpr, color="orange", alpha=0.2)

    # Figure settings
    ax[0].plot([0, 1], [0, 1], linestyle="--", label="Random Classifier", color="gray")
    ax[0].set_xlabel("False Positive Rate")
    ax[0].set_ylabel("True Positive Rate")
    ax[0].set_title(f"ROC Curve for {label_name}")
    ax[0].legend(loc="lower right")
    ax[0].grid(True)

    # Plot AUC score for Random Forest and SVC
    auc_text = (
        f"Random Forest: {rf_metrics[label_name]['AUC']:.2f}\n"
        f"SVC: {svc_metrics[label_name]['AUC']:.2f}"
    )

    ax[1].axis("off")  # Turn off axis for text display
    ax[1].text(
        0.5,
        0.5,
        auc_text,
        fontsize=14,
        ha="center",
        va="center",
        bbox=dict(boxstyle="round", facecolor="white", edgecolor="black"),
    )

    plt.show()


interact(plot_roc_curve, label_name=clf_labels)

interactive(children=(Dropdown(description='label_name', options=('LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LAB…

<function __main__.plot_roc_curve(label_name)>