In [5]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
import shap
import joblib
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import os

# --- Configuration ---
# Absolute root directory of the project; every data, model, SHAP-plot and
# MLflow path below is built from it (they do not depend on the current
# working directory). The default is the original author's checkout; set the
# ONCOAI_PROJECT_ROOT environment variable to run on another machine without
# editing this file.
project_root = os.environ.get(
    "ONCOAI_PROJECT_ROOT",
    '/Users/sangeethgeorge/MyProjects/oncoai-patient-outcome-navigator',
)

# Define paths for data, models, and SHAP plots
data_file_path = os.path.join(project_root, "data", "onco_features_cleaned.parquet")
model_save_base_path = os.path.join(project_root, "models")
shap_plots_base_path = os.path.join(project_root, "shap_plots")
mlruns_path = os.path.join(project_root, "mlruns")

# Ensure necessary directories exist. Note that only the *directory* holding
# the dataset is created here; the parquet file itself must already be present.
os.makedirs(os.path.dirname(data_file_path), exist_ok=True)
os.makedirs(model_save_base_path, exist_ok=True)
os.makedirs(shap_plots_base_path, exist_ok=True)
os.makedirs(mlruns_path, exist_ok=True)

# Ensure the .trash directory exists within mlruns
# (presumably expected by MLflow's local file store — confirm).
os.makedirs(os.path.join(mlruns_path, ".trash"), exist_ok=True)


# --- Data Loading Function ---
def load_dataset(path: str = data_file_path) -> pd.DataFrame:
    """
    Read the feature table from a parquet file.

    A missing file is reported on stdout instead of being raised, and an
    empty DataFrame is handed back so the caller can test ``df.empty``.

    Args:
        path (str): Location of the parquet file. Defaults to the
            module-level ``data_file_path``.

    Returns:
        pd.DataFrame: The parsed table, or an empty DataFrame when ``path``
        does not exist.
    """
    try:
        frame = pd.read_parquet(path)
    except FileNotFoundError:
        print(f"❌ Error: Dataset not found at {path}. Please ensure the file exists and the path is correct.")
        # An empty frame is the failure signal for callers.
        return pd.DataFrame()
    else:
        print(f"✅ Dataset loaded successfully from {path}")
        return frame

# --- Data Preprocessing Functions ---
def train_test_impute_split(df: pd.DataFrame, label_col: str = "mortality_30d") -> tuple:
    """
    Splits the data into training and testing sets, and imputes missing values
    in numeric columns using medians calculated from the training set only.

    The split is stratified on the label (80/20, ``random_state=42``).
    A numeric column is imputed when *either* split contains missing values,
    so a column that is complete in training but has gaps in the test set is
    still filled (with the training median). Non-numeric columns are left
    untouched.

    Args:
        df (pd.DataFrame): The input DataFrame. It is not modified.
        label_col (str): The name of the target column.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` — two feature DataFrames
        followed by two label Series.

    Raises:
        KeyError: If ``label_col`` is not a column of ``df``.
    """
    # Drop identifiers and timestamps
    df = df.drop(columns=['icustay_id', 'subject_id', 'hadm_id', 'admittime', 'dob', 'dod', 'intime', 'outtime', 'icd9_code'], errors='ignore')

    y = df[label_col]
    X = df.drop(columns=[label_col])

    # Split before imputation to prevent data leakage
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Work on explicit copies so the column assignments below never write
    # through to (or warn about) the frame the splits were taken from.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Impute missing values using training set statistics only
    for col in X_train.select_dtypes(include=np.number).columns:
        if not (X_train[col].isnull().any() or X_test[col].isnull().any()):
            continue
        median_val = X_train[col].median()
        if pd.isna(median_val):
            # Every training value is missing: there is no statistic to fill with.
            print(f"⚠️ Warning: column '{col}' has no observed values in the training set; it cannot be median-imputed.")
            continue
        X_train[col] = X_train[col].fillna(median_val)
        X_test[col] = X_test[col].fillna(median_val)

    print("✅ Data split and imputed successfully.")
    return X_train, X_test, y_train, y_test

def check_for_leakage(X: pd.DataFrame, y: pd.Series,
                      high_corr_threshold: float = 0.95) -> pd.DataFrame:
    """
    Checks for potential data leakage by identifying features whose absolute
    Pearson correlation with the target exceeds a threshold, and drops them.

    Rows of ``X`` and ``y`` are paired by position (their indices are ignored).
    Numeric and boolean feature columns are examined; boolean columns are
    included so that one-hot indicators are checked regardless of whether they
    are stored as integers or booleans. Other column types are never dropped.

    Args:
        X (pd.DataFrame): Features DataFrame. It is not modified.
        y (pd.Series): Target Series. It may be unnamed, and its name may
            coincide with a column of ``X``.
        high_corr_threshold (float): A feature is treated as leaky when
            ``abs(correlation) > high_corr_threshold``. Defaults to 0.95.

    Returns:
        pd.DataFrame: ``X`` itself when nothing is dropped (or the target is
        not numeric), otherwise a new DataFrame without the leaky columns.
    """
    # Private label for the target inside the working frame. Using y.name
    # directly breaks when X already has a column of that name (the lookup
    # then returns several columns) or when y is unnamed.
    target_label = "__leakage_check_target__"

    # Keep only columns a correlation can be computed for.
    X_numeric = X.select_dtypes(include=[np.number, "bool"])
    y_copy = y.copy()
    # Reset indices to ensure correct concatenation for correlation calculation
    X_numeric.index = range(len(X_numeric))
    y_copy.index = range(len(y_copy))
    y_copy.name = target_label

    combined_df = pd.concat([X_numeric, y_copy], axis=1)
    combined_df_numeric = combined_df.select_dtypes(include=[np.number, "bool"])

    if target_label not in combined_df_numeric.columns:
        print(f"⚠️ Warning: Target column '{y.name}' not found in numeric columns for leakage check. Skipping correlation check.")
        return X

    corr = combined_df_numeric.corr()[target_label].drop(target_label)
    # NaN correlations (e.g. constant columns) compare False and are kept.
    high_corr = corr[corr.abs() > high_corr_threshold]

    if not high_corr.empty:
        print(f"\n⚠️ Potential Leakage Detected (correlation > {high_corr_threshold}):")
        print(high_corr)
        leaky_columns = high_corr.index.tolist()
        X = X.drop(columns=leaky_columns, errors='ignore')
        print(f"Dropped potential leakage columns: {leaky_columns}")
    else:
        print("\nNo significant data leakage detected based on high correlation.")
    return X

# --- Model Training and Evaluation ---
def train_logistic_regression(X_train: np.ndarray, y_train: pd.Series, 
                              X_test: np.ndarray, y_test: pd.Series) -> tuple:
    """
    Fit a Logistic Regression classifier and report its hold-out performance.

    The classifier is built with ``max_iter=1000`` and ``random_state=42``.
    A classification report and the ROC AUC on the test split are printed.

    Args:
        X_train (np.ndarray): Scaled training features.
        y_train (pd.Series): Training labels.
        X_test (np.ndarray): Scaled testing features.
        y_test (pd.Series): Testing labels.

    Returns:
        tuple: ``(model, X_train, y_train, X_test, y_test, y_pred, y_prob)``
        where the four data arguments are passed through unchanged,
        ``y_pred`` holds the predicted labels for ``X_test`` and ``y_prob``
        the second column of ``predict_proba`` for ``X_test``.
    """
    classifier = LogisticRegression(max_iter=1000, random_state=42)
    classifier.fit(X_train, y_train)

    predicted_labels = classifier.predict(X_test)
    # Second predict_proba column (index 1).
    positive_class_proba = classifier.predict_proba(X_test)[:, 1]

    report_text = classification_report(y_test, predicted_labels)
    print("\n🧠 Classification Report:")
    print(report_text)

    auc_value = roc_auc_score(y_test, positive_class_proba)
    print("\n📊 ROC AUC Score:", auc_value)
    print("✅ Logistic Regression model trained and evaluated.")

    return (
        classifier,
        X_train,
        y_train,
        X_test,
        y_test,
        predicted_labels,
        positive_class_proba,
    )

# --- SHAP Explanation and Plotting ---
def explain_single_patient_prediction(model, patient_data_scaled: np.ndarray, patient_data_df: pd.DataFrame, 
                                      output_dir: str, background_data_scaled: np.ndarray, patient_id: str = "single_patient"):
    """
    Save a SHAP waterfall plot explaining one patient's prediction.

    SHAP values are computed on the scaled row, while the plot is annotated
    with the feature names and values taken from the unscaled DataFrame row.
    The figure is written to ``<output_dir>/waterfall_<patient_id>.png``.

    Args:
        model: The trained machine learning model.
        patient_data_scaled (np.ndarray): Scaled features of the patient,
                                          shape (1, n_features). Any other
                                          row count prints an error and
                                          returns without plotting.
        patient_data_df (pd.DataFrame): Original (unscaled) features of the same
                                        patient; its first row supplies the
                                        displayed values and its columns the
                                        feature names.
        output_dir (str): Directory to save the SHAP plot (created if missing).
        background_data_scaled (np.ndarray): Scaled background data handed to
                                             ``shap.LinearExplainer``.
        patient_id (str): Identifier used in the plot title and filename.
    """
    os.makedirs(output_dir, exist_ok=True)

    row_count = patient_data_scaled.shape[0]
    if row_count != 1:
        print("Error: explain_single_patient_prediction expects data for a single patient.")
        return

    # LinearExplainer is built from the model and the supplied background.
    linear_explainer = shap.LinearExplainer(model, background_data_scaled)
    contributions = linear_explainer.shap_values(patient_data_scaled)

    # The waterfall plot needs one Explanation describing exactly one row.
    displayed_row = patient_data_df.iloc[0]
    column_names = patient_data_df.columns.tolist()
    patient_explanation = shap.Explanation(
        values=contributions[0],
        base_values=linear_explainer.expected_value,
        data=displayed_row.values,
        feature_names=column_names,
    )

    shap.plots.waterfall(patient_explanation, show=False)
    plt.title(f"SHAP Waterfall - {patient_id}")
    figure_path = os.path.join(output_dir, f"waterfall_{patient_id}.png")
    plt.savefig(figure_path, bbox_inches='tight')
    plt.close()
    print(f"✅ SHAP waterfall plot for {patient_id} saved.")


def explain_predictions_batch(model, X_scaled: np.ndarray, X_df: pd.DataFrame, 
                              output_dir: str = "shap_plots", top_n: int = 10):
    """
    Generates SHAP plots for overall feature importance and top N high-risk patients.
    Suitable for Streamlit apps — saves figures as PNGs for display.
    This function is for batch explanation.

    SHAP values are computed on ``X_scaled`` — the representation the model is
    scored on below via ``predict_proba`` — so the attributions and base values
    agree with the risk scores. ``X_df`` supplies only the feature names and
    the unscaled values shown on the plots.

    Files written to ``output_dir``:
        * ``shap_summary_overall.png``
        * ``dependence_<feature>.png`` for the ``top_n`` features ranked by
          mean absolute SHAP value
        * ``waterfall_patient_<row>_rank_<k>.png`` for the ``top_n`` rows with
          the highest predicted probability

    Args:
        model: Trained model exposing ``predict_proba``.
        X_scaled (np.ndarray): Scaled features, one row per patient. Also used
            as the background data for the explainer.
        X_df (pd.DataFrame): Unscaled features for the same patients, in the
            same row and column order as ``X_scaled``.
        output_dir (str): Directory to save the plots (created if missing).
        top_n (int): Number of top features / highest-risk patients to plot.
            Values <= 0 produce no dependence or waterfall plots.

    Returns:
        shap.Explanation | None: The SHAP explanation for all rows (values from
        the scaled data, ``data`` holding the unscaled values), or ``None``
        when ``X_scaled`` has no rows.

    Raises:
        ValueError: If ``X_scaled`` and ``X_df`` have different row counts.
    """
    os.makedirs(output_dir, exist_ok=True)

    if X_scaled.shape[0] == 0:
        print("Skipping SHAP explanation: No data in X_scaled.")
        return None

    if X_scaled.shape[0] != len(X_df):
        raise ValueError(
            f"X_scaled has {X_scaled.shape[0]} rows but X_df has {len(X_df)}; "
            "they must describe the same patients."
        )

    top_n = max(top_n, 0)
    feature_names = X_df.columns.tolist()

    # Explain the model in the feature space it is scored in (scaled), then
    # re-wrap the result so plots display the human-readable unscaled values.
    explainer = shap.Explainer(model, X_scaled)
    scaled_explanation = explainer(X_scaled)
    shap_values = shap.Explanation(
        values=scaled_explanation.values,
        base_values=scaled_explanation.base_values,
        data=X_df.to_numpy(),
        feature_names=feature_names,
    )

    # Summary Plot (Overall)
    shap.summary_plot(shap_values, features=X_df, show=False)
    plt.savefig(os.path.join(output_dir, "shap_summary_overall.png"), bbox_inches='tight')
    plt.close()
    print("✅ SHAP summary plot saved.")

    # Dependence plots for top features
    if shap_values.values.size > 0:
        abs_mean = np.abs(shap_values.values).mean(axis=0)
        # Slicing copes with fewer features than top_n.
        top_feats_indices = np.argsort(abs_mean)[::-1][:top_n]

        for idx in top_feats_indices:
            feat = X_df.columns[idx]
            shap.dependence_plot(feat, shap_values.values, X_df, show=False)
            plt.title(f"SHAP Dependence: {feat}")
            # Path separators in a feature name would otherwise be read as
            # sub-directories of output_dir.
            safe_feat = str(feat).replace("/", "_").replace("\\", "_")
            plt.savefig(os.path.join(output_dir, f"dependence_{safe_feat}.png"), bbox_inches='tight')
            plt.close()
        print("✅ SHAP dependence plots saved.")
    else:
        print("No SHAP values to plot for dependence plots.")

    # Get top N high-risk patients (by predicted probability).
    # Reverse first, then slice: `[-top_n:]` would select every row for top_n == 0.
    risk_scores = model.predict_proba(X_scaled)[:, 1]
    top_indices = np.argsort(risk_scores)[::-1][:top_n]

    for i, idx in enumerate(top_indices):
        pid = f"patient_{idx}_rank_{i+1}"
        # SHAP waterfall expects a single Explanation object
        single_expl = shap.Explanation(
            values=shap_values.values[idx],
            base_values=shap_values.base_values[idx],
            data=X_df.iloc[idx].values,
            feature_names=feature_names,
        )
        shap.plots.waterfall(single_expl, show=False)
        plt.title(f"SHAP Waterfall - {pid}")
        plt.savefig(os.path.join(output_dir, f"waterfall_{pid}.png"), bbox_inches='tight')
        plt.close()
    print(f"✅ SHAP waterfall plots for top {len(top_indices)} patients saved.")

    return shap_values

# --- Model Saving Function ---
def save_model(model, scaler, output_path: str):
    """
    Persist the model together with its scaler in a single joblib file.

    The file holds a dict with the keys ``"model"`` and ``"scaler"``.

    Args:
        model: The trained machine learning model.
        scaler: The fitted StandardScaler object.
        output_path (str): Destination file path.
    """
    bundle = dict(model=model, scaler=scaler)
    joblib.dump(bundle, output_path)
    print(f"\n✅ Saved model and scaler to {output_path}")

# --- Main MLflow Execution Block ---
# Pipeline: load data -> split/impute -> one-hot encode -> leakage check ->
# scale -> train/evaluate -> save + log model -> SHAP plots -> log artifacts.
if __name__ == "__main__":
    # Set up MLflow tracking
    # NOTE(review): mlruns_path is created in the configuration section but is
    # never passed to MLflow here (no set_tracking_uri call) — confirm that
    # the tracking location MLflow picks on its own is the intended one.
    mlflow.set_experiment("OncoAI-Mortality-Prediction")

    # Only report success at the end if the pipeline actually ran through.
    run_succeeded = False

    # Start an MLflow run
    with mlflow.start_run() as run:
        run_id = run.info.run_id
        
        # Make SHAP plots output directory specific to the run
        run_shap_output_dir = os.path.join(shap_plots_base_path, run_id)
        os.makedirs(run_shap_output_dir, exist_ok=True)
        
        # Define the full model save path for this run
        model_save_path_for_run = os.path.join(model_save_base_path, f"logreg_model_run_{run_id}.joblib")

        print(f"Starting MLflow Run with ID: {run_id}")
        print(f"SHAP plots will be saved to: {run_shap_output_dir}")
        print(f"Model will be saved to: {model_save_path_for_run}")

        # 1. Load dataset
        df = load_dataset()

        if df.empty: # Check if DataFrame is empty
            print("❌ Dataset is empty. Cannot proceed with training and explanation. Exiting MLflow run.")
            mlflow.end_run(status="FAILED")
        else:
            # 2. Train-test split and imputation
            X_train, X_test, y_train, y_test = train_test_impute_split(df)

            # 3. One-hot encode categorical columns.
            # Only the training split decides which level is the dropped
            # baseline. The test split is encoded in full and then projected
            # onto the training columns: applying drop_first to it separately
            # could drop a different level when a category is absent from the
            # test rows, silently mis-encoding them.
            X_train_ohe = pd.get_dummies(X_train, drop_first=True)
            X_test_ohe = pd.get_dummies(X_test).reindex(
                columns=X_train_ohe.columns, fill_value=0
            )

            # 4. Check for data leakage on the one-hot encoded training data
            X_train_leakage_checked = check_for_leakage(X_train_ohe, y_train)

            # Apply the same column selection (after leakage check) to the test set
            X_test_leakage_checked = X_test_ohe[X_train_leakage_checked.columns]

            # DEBUG: Print column names to verify before scaling and SHAP
            print(f"\nDEBUG: Columns of X_train_leakage_checked before scaling:\n{X_train_leakage_checked.columns.tolist()}")
            print(f"DEBUG: Columns of X_test_leakage_checked before scaling:\n{X_test_leakage_checked.columns.tolist()}")
            
            # 5. Scale features (statistics come from the training split only)
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train_leakage_checked)
            X_test_scaled = scaler.transform(X_test_leakage_checked)

            print("✅ Features prepared (one-hot encoded and scaled).")

            # Log parameters to MLflow
            mlflow.log_param("scaler", "StandardScaler")
            mlflow.log_param("model_type", "LogisticRegression")

            # 6. Train Logistic Regression model
            model, X_train_final_scaled, y_train_final, X_test_final_scaled, y_test_final, y_pred, y_prob = \
                train_logistic_regression(X_train_scaled, y_train, X_test_scaled, y_test)

            # Log ROC AUC metric to MLflow
            auc = roc_auc_score(y_test_final, y_prob)
            mlflow.log_metric("roc_auc", auc)

            # 7. Save model and scaler
            save_model(model, scaler, output_path=model_save_path_for_run)

            # 8. Log model with input_example for signature inference.
            # The logged estimator consumes *scaled* features, so the example
            # must be in that space too (the scaler lives in the joblib bundle,
            # not in the MLflow model).
            if X_train_leakage_checked.shape[0] > 0:
                input_example = pd.DataFrame(
                    X_train_scaled[:10],
                    columns=X_train_leakage_checked.columns,
                )
                mlflow.sklearn.log_model(model, "logreg_model", 
                                         input_example=input_example)
            else:
                mlflow.sklearn.log_model(model, "logreg_model") 

            # 9. Explain with SHAP for batch (overall and top N high-risk patients)
            explain_predictions_batch(model, X_test_scaled, X_test_leakage_checked, output_dir=run_shap_output_dir, top_n=10)

            # Explain with SHAP for a single patient
            # Select an arbitrary patient from the test set for demonstration
            if X_test_scaled.shape[0] > 0:
                single_patient_idx = 0  # Explain the first patient in the test set
                single_patient_scaled_data = X_test_scaled[single_patient_idx].reshape(1, -1)
                single_patient_df = X_test_leakage_checked.iloc[[single_patient_idx]]
                
                # Use a small sample of the scaled training data as SHAP
                # background. The sample is drawn from a seeded generator so
                # repeated runs produce the same explanation.
                background_sample_size = min(100, X_train_scaled.shape[0]) # Use up to 100 samples
                background_rng = np.random.default_rng(42)
                background_rows = background_rng.choice(
                    X_train_scaled.shape[0], size=background_sample_size, replace=False
                )
                background_data_for_shap = X_train_scaled[background_rows]

                explain_single_patient_prediction(model, single_patient_scaled_data, 
                                                  single_patient_df, run_shap_output_dir, 
                                                  background_data_for_shap, # Pass the background data
                                                  patient_id=f"test_patient_{single_patient_idx}")
            else:
                print("No test data available to explain a single patient.")

            # 10. Log SHAP plots as MLflow artifacts
            shap_plot_files = [f for f in os.listdir(run_shap_output_dir) if f.endswith('.png')]
            for plot_file in shap_plot_files:
                mlflow.log_artifact(os.path.join(run_shap_output_dir, plot_file), artifact_path="shap_plots")

            run_succeeded = True

    if run_succeeded:
        print("\n✨ MLflow run completed successfully. Check your MLflow UI for details.")
    else:
        print("\n❌ MLflow run did not complete. See the messages above for the reason.")




Starting MLflow Run with ID: ae10757d5eb94eb681115211fb918898
SHAP plots will be saved to: /Users/sangeethgeorge/MyProjects/oncoai-patient-outcome-navigator/shap_plots/ae10757d5eb94eb681115211fb918898
Model will be saved to: /Users/sangeethgeorge/MyProjects/oncoai-patient-outcome-navigator/models/logreg_model_run_ae10757d5eb94eb681115211fb918898.joblib
✅ Dataset loaded successfully from /Users/sangeethgeorge/MyProjects/oncoai-patient-outcome-navigator/data/onco_features_cleaned.parquet
✅ Data split and imputed successfully.

No significant data leakage detected based on high correlation.

DEBUG: Columns of X_train_leakage_checked before scaling:
['mean_mchc', 'min_heart_rate', 'max_bicarbonate', 'mean_urea_nitrogen', 'age', 'min_white_blood_cells', 'mean_chloride', 'max_mchc', 'min_urea_nitrogen', 'mean_glucose']
DEBUG: Columns of X_test_leakage_checked before scaling:
['mean_mchc', 'min_heart_rate', 'max_bicarbonate', 'mean_urea_nitrogen', 'age', 'min_white_blood_cells', 'mean_chlorid



✅ SHAP summary plot saved.
✅ SHAP dependence plots saved.
✅ SHAP waterfall plots for top 10 patients saved.
✅ SHAP waterfall plot for test_patient_0 saved.

✨ MLflow run completed successfully. Check your MLflow UI for details.
