### Helpers in case of version incompatibilities

## Setup of pipeline

### Import

In [None]:
%pip install -e "../../modeling_pipeline/src"

%pip install -r requirements.txt

In [None]:
import sys
sys.path.append("../../modeling_pipeline") #Because the project is in a different folder (two levels up), we need to add the path to the sys path
sys.path.append("../..")                    #Basically just to reduce error messages rn
%load_ext autoreload
%autoreload 2

from pipeline import *
import os

plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['DejaVu Sans'] #change font to a known standard font

with open('user_input.yaml') as file:
    user_input = yaml.load(file, Loader=yaml.FullLoader)

## Load All of Us Dataframe

In [None]:
DOI = user_input["DOI"]
orig_path = '/home/jupyter/workspaces/machinelearningforlivercancerriskprediction/'

In [None]:
# Workspace root; both `path` and `orig_path` are read by later cells.
orig_path = '/home/jupyter/workspaces/machinelearningforlivercancerriskprediction/'
path = orig_path
DOI = user_input["DOI"]

# Export date (folder name) and cohort to load.
date, subset = "06_08_2025", "all"  # subset: "all" or "par"

data_path = os.path.join(path, "data", date)
X_file_path, y_file_path = (
    os.path.join(data_path, f"{prefix}_outer_basic_{subset}.csv") for prefix in ("X", "y")
)


def _print_cohort_shapes():
    """Print the current shapes of the validation feature and target frames."""
    print(f"X_val_df shape: {X_val_df.shape}")
    print(f"y_val_df shape: {y_val_df.shape}")


# Read features and targets; the current pipeline version expects a `split_int` column.
X_val_df = pd.read_csv(X_file_path).assign(split_int=0)
y_val_df = pd.read_csv(y_file_path)
_print_cohort_shapes()

# Preview both frames.
print("\nFirst few rows of X_val_df:")
print(X_val_df.head())
print("\nFirst few rows of y_val_df:")
print(y_val_df.head())

# Keep only rows whose status is not 2, applying the same mask to features and targets.
mask = y_val_df["status"].ne(2)
X_val_df, y_val_df = X_val_df.loc[mask].copy(), y_val_df.loc[mask].copy()
_print_cohort_shapes()


### PAR cohort

In [None]:
# Workspace root; both `path` and `orig_path` are read by later cells.
orig_path = '/home/jupyter/workspaces/machinelearningforlivercancerriskprediction/'
path = orig_path
DOI = user_input["DOI"]

# Export date (folder name) and cohort to load.
date, subset = "06_08_2025", "par"  # subset: "all" or "par"

data_path = os.path.join(path, "data", date)
X_file_path, y_file_path = (
    os.path.join(data_path, f"{prefix}_outer_basic_{subset}.csv") for prefix in ("X", "y")
)


def _print_cohort_shapes():
    """Print the current shapes of the validation feature and target frames."""
    print(f"X_val_df shape: {X_val_df.shape}")
    print(f"y_val_df shape: {y_val_df.shape}")


# Read features and targets; the current pipeline version expects a `split_int` column.
X_val_df = pd.read_csv(X_file_path).assign(split_int=0)
y_val_df = pd.read_csv(y_file_path)
_print_cohort_shapes()

# Preview both frames.
print("\nFirst few rows of X_val_df:")
print(X_val_df.head())
print("\nFirst few rows of y_val_df:")
print(y_val_df.head())

# Keep only rows whose status is not 2, applying the same mask to features and targets.
mask = y_val_df["status"].ne(2)
X_val_df, y_val_df = X_val_df.loc[mask].copy(), y_val_df.loc[mask].copy()
_print_cohort_shapes()


#### Subset Population for bias investigation

In [None]:
X_val_df
y_val_df

#### Load model object

In [None]:


# Relative location of the stored external-validation object for the selected DOI and cohort.
model_name = f'Validation_Objects/Pipeline_{DOI}_{subset}_Model_TOP15_RFC_external_val.joblib'
full_path = os.path.join(path, model_name)
model_path = []  # NOTE(review): not read anywhere in the visible notebook; kept in case another cell uses it
# Announce the file before loading so a failing load still shows which file was attempted.
print(f"Loading file from: {full_path}")
ext_val = load(full_path)

#### Initialize pipeline

In [None]:
# Build the pipeline from the loaded external-validation object and attach the validation data.
pl_ext = Pipeline(ext_val_obj=ext_val)
pl_ext.external_validation(X_val=X_val_df, y_val=y_val_df)

# Set up file paths relative to a single workspace root.
workspace_root = '/home/jupyter/workspaces/machinelearningforlivercancerriskprediction/'
pl_ext.user_input.path = workspace_root
# os.path.join avoids the doubled slash that plain concatenation produced (root ends with "/").
pl_ext.user_input.fig_path = os.path.join(workspace_root, "HCC", "visuals")
pl_ext.pipeline_output_path = workspace_root
pl_ext.user_input.model_path = pl_ext.pipeline_output_path
# Important to change this, especially if training was validated on status_cancerreg.
pl_ext.user_input.target_to_validate_on = "status"

### Apply OHE Transformation

In [None]:
pl_ext.ohe.transform(pl_ext.data.X_val) #Apply one-hot encoder
#pl_ext.data.X_ohe_df.head()

In [None]:
pl_ext.user_input.row_subset

## Apply Loop "All" (first evaluation)

In [None]:
# Import dataframe before (df does not change for models, so no integration in loop)
models = ["TOP15"]
row_subsets = ["all"]
estimators = ["RFC", "CatBoost"]
path = '/home/jupyter/workspaces/machinelearningforlivercancerriskprediction/'


for model in models:
    for row_subset in row_subsets:
        for estimator in estimators:
            #Load model object

            model_name = f'Validation_Objects/Pipeline_{DOI}_{row_subset}_Model_{model}_{estimator}_external_val.joblib'
            full_path = os.path.join(path, model_name)
            ext_val = load(full_path) # Load the file
            print(f"Loading file from: {full_path}")


            #Initialize pipeline
            pl_ext={} #reset pipeline before creating new one
            pl_ext=Pipeline(ext_val_obj=ext_val) #Initialize pipeline object
            pl_ext.external_validation(X_val=X_val_df,y_val=y_val_df)  # Load ext_data into pipeline

            # Setup filepaths
            pl_ext.model_type = estimator #before, this will be "not_trained" -> This is needed as long as ext val objects do not get a true model_type assigned
            pl_ext.user_input.path = path
            pl_ext.user_input.fig_path = pl_ext.user_input.path + "visuals"
            pl_ext.pipeline_output_path = path
            pl_ext.user_input.model_path = pl_ext.pipeline_output_path
            pl_ext.user_input.target_to_validate_on = "status"

            #Apply one-hot encoder
            pl_ext.ohe.transform(pl_ext.data.X_val)

            # Create master_rfc
            pl_ext.build_master_RFC()

            #Initialize eval class
            pl_ext.evaluation(only_val=True)

            #Export evaluation
            pl_ext.save_values_for_validation()

            #Save Pipeline object for future reference
            pl_ext.save_Pipeline()


print("Loop finished")

## Apply First LOOP "PSC_CCA"

In [None]:
# Import dataframe before (df does not change for models, so no integration in loop)
models = ["TOP15"]
row_subsets = ["all"]
estimators = ["RFC", "CatBoost"]
path = '/home/jupyter/workspaces/machinelearningforlivercancerriskprediction/'


for model in models:
    for row_subset in row_subsets:
        for estimator in estimators:
            #Load model object

            model_name = f'Validation_Objects/Pipeline_{DOI}_{row_subset}_Model_{model}_{estimator}_external_val.joblib'
            full_path = os.path.join(path, model_name)
            ext_val = load(full_path) # Load the file
            print(f"Loading file from: {full_path}")


            #Initialize pipeline
            pl_ext={} #reset pipeline before creating new one
            pl_ext=Pipeline(ext_val_obj=ext_val) #Initialize pipeline object
            pl_ext.external_validation(X_val=X_val_df,y_val=y_val_df)  # Load ext_data into pipeline

            # Setup filepaths
            pl_ext.model_type = estimator #before, this will be "not_trained" -> This is needed as long as ext val objects do not get a true model_type assigned
            pl_ext.user_input.path = path
            pl_ext.user_input.fig_path = pl_ext.user_input.path + "visuals"
            pl_ext.pipeline_output_path = path
            pl_ext.user_input.model_path = pl_ext.pipeline_output_path
            pl_ext.user_input.target_to_validate_on = "status"

            #Apply one-hot encoder
            pl_ext.ohe.transform(pl_ext.data.X_val)

            # Create master_rfc
            pl_ext.build_master_RFC()

            #Initialize eval class
            pl_ext.evaluation(only_val=True)

            #Export evaluation
            pl_ext.save_values_for_validation()

            #Save Pipeline object for future reference
            pl_ext.save_Pipeline()


print("Loop finished")

In [None]:
pl_ext.summarize_data()

In [None]:
pl_ext.model_type

In [None]:
summarize_df(
    pl_ext.data.X,
    pl_ext.data.y,
    getattr(pl_ext.data, "X_val", None),
    getattr(pl_ext.data, "y_val", None),
    pl_ext.user_input.col_subset,
    pl_ext.user_input.row_subset,
    pl_ext.user_input.DOI
)

In [None]:
#debug

pl_ext.eval.test_train_pred['val']

## Apply Loop (from second evaluation)

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_curve, average_precision_score
import joblib
import os


def export_interpolated_pr_curves(pl_ext, path=None, cohort='val', save_format="joblib", biobank=None):
    """
    Export interpolated precision-recall curves for one cohort of a pipeline.

    For every column of ``pl_ext.eval.test_train_pred[cohort]`` whose name starts
    with ``y_pred``, the precision-recall curve against the ``status`` column is
    computed, interpolated onto 100 evenly spaced recall values in [0, 1] and
    merged into a ``PRC_combined`` file. Columns written by an earlier export
    with the same identifier are replaced.

    Parameters:
    -----------
    pl_ext : pipeline object
        Must provide ``eval.test_train_pred`` (mapping cohort -> DataFrame),
        ``user_input.row_subset``, ``user_input.col_subset``, ``model_type`` and,
        when ``path`` is None, ``pipeline_output_path``.
    path : str or None
        Output directory. When None,
        ``<pl_ext.pipeline_output_path>/combined_output/<cohort>`` is used.
    cohort : str
        Key into ``test_train_pred`` (e.g. 'train', 'test', 'val').
    save_format : str
        'joblib' or 'csv'; any other value is written as Excel ('xlsx').
    biobank : str or None
        First part of the column identifier
        ``{biobank}_{row_subset}_{col_subset}_{estimator}``.

    Returns:
    --------
    pd.DataFrame : interpolated precision values, one column per prediction
        column, renamed with the identifier prefix. An empty DataFrame is
        returned (and nothing is written) when the cohort or the ``status``
        column is missing or no prediction column yields valid data.
    """

    print(f"Starting export_interpolated_pr_curves for {cohort} cohort using format: {save_format}")

    # Get the DataFrame from test_train_pred
    if cohort not in pl_ext.eval.test_train_pred:
        print(f"{cohort} data not available in test_train_pred")
        return pd.DataFrame()

    df = pl_ext.eval.test_train_pred[cohort].copy()
    print(f"DataFrame shape: {df.shape}")
    print(f"DataFrame columns: {df.columns.tolist()}")

    # Common recall grid (100 points from 0 to 1) shared by all exported curves
    recall_base = np.linspace(0, 1, 100)

    # Prediction columns (y_pred, y_pred_val_model_0, y_pred_val_model_1, ...).
    # A plain 'y_pred' column is already matched by the prefix test.
    model_cols = [col for col in df.columns if isinstance(col, str) and col.startswith('y_pred')]

    # True labels are taken from the 'status' column
    if 'status' not in df.columns:
        print("Warning: 'status' column not found. Available columns:", df.columns.tolist())
        return pd.DataFrame()

    y_true = df['status']

    print(f"Found {len(model_cols)} model prediction columns: {model_cols}")

    prc_interpolated = pd.DataFrame()

    # Process each model's predictions
    for model_col in model_cols:
        y_pred = df[model_col]

        # Drop rows where either the label or the prediction is missing
        valid_mask = ~(pd.isna(y_true) | pd.isna(y_pred))
        y_true_clean = y_true[valid_mask]
        y_pred_clean = y_pred[valid_mask]

        if len(y_true_clean) == 0:
            print(f"Warning: No valid data for {model_col}")
            continue

        precision, recall, _ = precision_recall_curve(y_true_clean, y_pred_clean)

        # np.interp needs ascending x values. A stable sort keeps tied recall
        # values in a reproducible order, so the interpolation is deterministic.
        order = np.argsort(recall, kind="stable")
        precision_interp = np.interp(recall_base, recall[order], precision[order])

        # Store in DataFrame with appropriate column name
        col_name = model_col.replace('y_pred_val_model_', 'model_').replace('y_pred', 'model_mean')
        prc_interpolated[col_name] = precision_interp

        print(f"Processed {model_col} -> {col_name}")

    if prc_interpolated.empty:
        print("No valid model predictions found")
        return pd.DataFrame()

    # Identifier used as column prefix (same structure as save_performance_combination)
    row_subset = pl_ext.user_input.row_subset
    col_subset = pl_ext.user_input.col_subset
    estimator = pl_ext.model_type
    identifier = f"{biobank}_{row_subset}_{col_subset}_{estimator}"
    identifier_prefix = f"{identifier}_"

    # Either save the PRC in the pipeline specific output path or in the path given by the user
    if path is None:
        combined_output_path = os.path.join(
            pl_ext.pipeline_output_path,
            f"combined_output/{cohort}"
        )
    else:
        combined_output_path = path

    os.makedirs(combined_output_path, exist_ok=True)

    # Rename columns with identifier (same pattern as TPRS)
    prc_export = prc_interpolated.rename(columns=lambda x: f"{identifier}_{str(x)}")

    if save_format == "joblib":
        prc_combined_path = os.path.join(combined_output_path, "PRC_combined.joblib")

        # Load existing file or create new DataFrame
        if os.path.exists(prc_combined_path):
            prc_combined = joblib.load(prc_combined_path)
        else:
            prc_combined = pd.DataFrame()

        # Remove any existing columns with the same identifier
        stale_cols = [col for col in prc_combined.columns if str(col).startswith(identifier_prefix)]
        prc_combined = prc_combined.drop(columns=stale_cols, errors='ignore')

        # Add new data
        prc_combined = pd.concat([prc_combined, prc_export], axis=1)

        # Save
        joblib.dump(prc_combined, prc_combined_path)
        print(f"PRC data saved to {prc_combined_path}")

    else:
        # CSV/Excel: resolve the file for the requested format first, so the
        # existence check and the read always refer to the same file.
        use_csv = save_format == "csv"
        file_name = "PRC_combined.csv" if use_csv else "PRC_combined.xlsx"
        prc_combined_path = os.path.join(combined_output_path, file_name)

        if os.path.exists(prc_combined_path):
            if use_csv:
                prc_combined = pd.read_csv(prc_combined_path)
            else:
                prc_combined = pd.read_excel(prc_combined_path)
        else:
            prc_combined = pd.DataFrame()

        # Remove existing columns with same identifier
        stale_cols = [col for col in prc_combined.columns if str(col).startswith(identifier_prefix)]
        if stale_cols:
            prc_combined = prc_combined.drop(columns=stale_cols)

        # Add new data
        prc_combined_export = pd.concat([prc_combined, prc_export], axis=1)

        # Save in requested format
        if use_csv:
            prc_combined_export.to_csv(prc_combined_path, index=False)
        else:
            prc_combined_export.to_excel(prc_combined_path, index=False)
        print(f"PRC data exported to {prc_combined_path}")

    print(f"Exported PRC data shape: {prc_export.shape}")
    return prc_export

In [None]:
pl_ext.data.X_val[pl_ext.data.X_val["SEX"]==1].info()

In [None]:
export_interpolated_pr_curves(pl_ext, path= os.path.join(orig_path, "combined_output"), cohort='val', biobank="AOU")

## Evaluation

### Create "trained Model"

In [None]:
pl_ext.build_master_RFC()

In [None]:
pl_ext.user_input.target_to_validate_on

### Start Evaluation

In [None]:
pl_ext.evaluation(only_val=True) #initialize eval class

In [None]:
pl_ext.evaluation_summary_independent()
pl_ext.evaluation_summary_threshold_dependent(thresholds=np.arange(0.7, 0.29, -0.02), beta=10)

### Export model metrics for plots + export pipeline object

In [None]:
pl_ext.save_values_for_validation()
pl_ext.save_Pipeline()

In [None]:
import openpyxl
from pprint import pprint

def get_excel_metadata(file_path):
    """
    Extract metadata from an Excel file.

    This is a best-effort helper: any error while opening or reading the
    workbook is printed and the metadata collected so far is returned.

    Args:
    file_path (str): Path to the Excel file

    Returns:
    dict: ``file_path``, ``number_of_sheets`` and, per sheet name under
        ``sheets``, the values of the first row (``column_names``) and their
        count (``number_of_columns``). A sheet without rows is reported with
        an empty column list.
    """
    metadata = {
        "file_path": file_path,
        "number_of_sheets": 0,
        "sheets": {}
    }

    workbook = None
    try:
        # Load the workbook
        workbook = openpyxl.load_workbook(file_path, read_only=True)

        # Get the number of sheets
        metadata["number_of_sheets"] = len(workbook.sheetnames)

        # Iterate through each sheet
        for sheet_name in workbook.sheetnames:
            sheet = workbook[sheet_name]

            # First row is assumed to hold the headers. The default of () keeps
            # an empty sheet from raising StopIteration and aborting the
            # remaining sheets.
            header_row = next(sheet.iter_rows(min_row=1, max_row=1), ())
            column_names = [cell.value for cell in header_row]

            # Add sheet info to metadata
            metadata["sheets"][sheet_name] = {
                "column_names": column_names,
                "number_of_columns": len(column_names)
            }

    except Exception as e:
        print(f"An error occurred: {str(e)}")

    finally:
        # Read-only workbooks keep the file open until closed explicitly.
        if workbook is not None:
            workbook.close()

    return metadata

# File path
file_path = "/home/jupyter/workspaces/machinelearningforlivercancerriskprediction/combined_output/val/Prediction_values_combined.xlsx"

# Get metadata
excel_metadata = get_excel_metadata(file_path)

# Print metadata
print("Excel File Metadata:")
pprint(excel_metadata)

### Create summary tables for model metrics (1/2)

In [None]:
def round_columns(df, columns, precision):
    """Round the given columns of ``df`` in place and return ``df``.

    Args:
        df: DataFrame to modify.
        columns: list of column labels to round.
        precision: number of decimals to keep.

    Returns:
        The same DataFrame object, with the selected columns rounded.
    """
    # DataFrame.round replaces the deprecated DataFrame.applymap call.
    df[columns] = df[columns].round(precision)
    return df


# Number of decimals kept per numeric column of the exported table.
columns_precision = {
    'Mean': 3,
    'Std. Dev.': 3,
    'Fold 1': 3,
    'Fold 2': 3,
    'Fold 3': 3,
    'Fold 4': 3,
    'Fold 5': 3,
}
default_fold_precision = 3  # used for fold columns beyond those listed above

row_subsets = ["par", "all"]
col_subsets = ['Model_A', 'Model_B', 'Model_C', 'Model_D', 'Model_E']

# Collect one row per (dataset, model, metric) and build the table once at the end.
evaluation_rows = []
for row_subset in row_subsets:
    for col_subset in col_subsets:
        pl = load_Pipeline(path+ f'/Models/Pipelines/RFC/Pipeline_HCC_{row_subset}_{col_subset}_RFC.joblib')
        pl.evaluation()
        aucs = pl.eval.val["aucs"]
        auprcs = pl.eval.val["auprcs"]

        for metric_name, fold_scores in (('AUROC', aucs), ('AUPRC', auprcs)):
            row = {
                'Model': col_subset,
                'Dataset': row_subset,
                'Metric': metric_name,
                'Mean': np.mean(fold_scores),
                'Std. Dev.': np.std(fold_scores),
            }
            # One column per fold, however many folds the pipeline was evaluated with.
            row.update({f'Fold {fold}': score for fold, score in enumerate(fold_scores, start=1)})
            evaluation_rows.append(row)

all_evaluation_results = pd.DataFrame(evaluation_rows)  # For storing all evals

# Round once, after the table is complete.
for column in all_evaluation_results.columns:
    if column in columns_precision or str(column).startswith('Fold '):
        all_evaluation_results = round_columns(
            all_evaluation_results, [column], columns_precision.get(column, default_fold_precision)
        )

print(all_evaluation_results)

output_path = path+ f'/Models/Pipelines/RFC/combined_output/val/all_evaluation_results.xlsx'
# Append mode needs an existing workbook; create the file on first export and
# replace the sheet on re-runs instead of failing.
if os.path.exists(output_path):
    writer_options = {"mode": "a", "if_sheet_exists": "replace"}
else:
    writer_options = {"mode": "w"}
with pd.ExcelWriter(output_path, **writer_options) as writer:
    all_evaluation_results.to_excel(writer, sheet_name='Independent metrics', index=False)


### Summary model metrics (threshold-dependent) (2/2)

In [None]:
from sklearn.metrics import classification_report, f1_score, accuracy_score, average_precision_score, precision_score, recall_score, balanced_accuracy_score, confusion_matrix, fbeta_score

def round_columns(df, columns, precision):
    """Round the given columns of ``df`` in place and return ``df``.

    Args:
        df: DataFrame to modify.
        columns: list of column labels to round.
        precision: number of decimals to keep.

    Returns:
        The same DataFrame object, with the selected columns rounded.
    """
    # DataFrame.round replaces the deprecated DataFrame.applymap call.
    df[columns] = df[columns].round(precision)
    return df

beta = 10
# The F-beta column name includes the beta value; the rounding table must use the
# same label, otherwise rounding fails with a KeyError.
fbeta_column = f'F-beta Score (beta={beta})'

# Number of decimals kept per metric column.
columns_precision = {
    'Precision': 3,
    'Recall': 2,
    'Accuracy': 2,
    'F1 Score': 3,
    fbeta_column: 3,
    'Balanced Accuracy': 2,
    'PPV': 4,
    'NPV': 4,
}
row_subsets = ["par", "all"]
col_subsets = ['Model_C']

thresholds = np.arange(0.6, 0.29, -0.01)

# Collect one row per (dataset, model, threshold) and build the table once at the end.
threshold_rows = []
for row_subset in row_subsets:
    for col_subset in col_subsets:
        pl = load_Pipeline(path + f'/Models/Pipelines/RFC/Pipeline_HCC_{row_subset}_{col_subset}_RFC.joblib')
        pl.evaluation()

        # Predicted probabilities and labels do not depend on the threshold: compute them once.
        proba = pl.master_RFC.predict_proba(pl.ohe.transform(pl.data.X_val))
        print(f"Prediction probabilities (first 5): {proba[:5]}") # Debug print
        positive_scores = proba if proba.ndim == 1 else proba[:, 1]
        y_true = pl.data.z_val["status_cancerreg"]

        for threshold in thresholds:
            y_pred = (positive_scores >= threshold).astype(int)

            # labels=[0, 1] forces a 2x2 matrix even if one class is absent at this threshold.
            tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

            threshold_rows.append({
                'Model': col_subset,
                'Dataset': row_subset,
                'Threshold': threshold,
                'Precision': precision_score(y_true, y_pred),
                'Recall': recall_score(y_true, y_pred),
                'Accuracy': accuracy_score(y_true, y_pred),
                'F1 Score': f1_score(y_true, y_pred),
                fbeta_column: fbeta_score(y_true, y_pred, beta=beta),
                'Balanced Accuracy': balanced_accuracy_score(y_true, y_pred),
                # PPV/NPV are undefined when nothing is predicted positive/negative.
                'PPV': tp / (tp + fp) if (tp + fp) else np.nan,  # Same as precision
                'NPV': tn / (tn + fn) if (tn + fn) else np.nan,
            })

threshold_evaluation_results = pd.DataFrame(threshold_rows)

# Round once, after the table is complete.
for column, digits in columns_precision.items():
    if column in threshold_evaluation_results.columns:
        threshold_evaluation_results = round_columns(threshold_evaluation_results, [column], digits)


output_path = path+ f'/Models/Pipelines/RFC/combined_output/val/all_evaluation_results.xlsx'
# Append the second sheet when the workbook already exists (replacing it on re-runs);
# otherwise create the workbook.
if os.path.exists(output_path):
    writer_options = {"mode": "a", "if_sheet_exists": "replace"}
else:
    writer_options = {"mode": "w"}
with pd.ExcelWriter(output_path, **writer_options) as writer:
    threshold_evaluation_results.to_excel(writer, sheet_name='Threshold metrics_all_thresholds', index=False)


## Visualizations

### Load pipeline

In [None]:
from pipeline import * #Load our package with classes pipeline, models, pp (preprocessing), plot, and more





# Workspace root directory (note the trailing slash).
path = '/home/jupyter/workspaces/machinelearningforlivercancerriskprediction/'
# Load a previously saved pipeline object; edit the relative path to pick a different one.
# NOTE(review): the directory is "CatBoost" but the file name ends in "_RFC", whereas the
# figure loop below builds paths as Pipelines/{estimator}/..._{estimator}.joblib — confirm
# that this combination is intended.
pl_ext=load_Pipeline(path + "/Pipelines/CatBoost/Pipeline_HCC_par_Model_TOP15_RFC.joblib") #Change for pipeline you want


In [None]:
# Append ethnicity info for subanalysis

df_ethnicity_boolean = pd.read_csv(os.path.join(path, "HCC/df_ethnicity_boolean.csv"))

# A left merge keeps every validation row in its original order, so X_val stays
# aligned with y_val (an inner merge could silently drop rows from X_val only).
# validate="many_to_one" raises if an eid occurs more than once in the ethnicity
# table, which would otherwise duplicate validation rows.
X_val_before_merge = pl_ext.data.X_val
X_val_with_ethnicity = X_val_before_merge.merge(
    df_ethnicity_boolean, on="eid", how="left", validate="many_to_one"
)
# merge() returns a fresh 0..n-1 index; restore the original row labels.
X_val_with_ethnicity.index = X_val_before_merge.index
pl_ext.data.X_val = X_val_with_ethnicity

# Print info about the resulting DataFrame
print(pl_ext.data.X_val.shape)
print(pl_ext.data.X_val.columns)

In [None]:
# Append timepoint of diagnosis for Kaplan Meier curve



### Figure Loop

In [None]:
from itertools import product

models = ["TOP10", "TOP5"]
row_subsets = ["all"]
estimators = ["RFC", "CatBoost"]

# One pass per (model, row_subset, estimator) combination, in the same order as nested loops.
for model, row_subset, estimator in product(models, row_subsets, estimators):
    # Stored pipelines live in one folder per estimator. A dedicated variable is used so
    # the notebook-wide `path` (workspace root) is not overwritten by this loop.
    pipeline_dir = f'/home/jupyter/workspaces/machinelearningforlivercancerriskprediction/Pipelines/{estimator}'
    model_name = f'Pipeline_{DOI}_{row_subset}_Model_{model}_{estimator}.joblib'
    full_path = os.path.join(pipeline_dir, model_name)
    print(f"Loading pipeline object from: {full_path}")
    pl_ext = load_Pipeline(full_path)

    # model_type is "not_trained" on loaded objects, so it is set explicitly here
    # (needed as long as ext-val objects do not get a true model_type assigned).
    pl_ext.model_type = estimator

    # Export the prediction figure for a single threshold; pass a longer list such as
    # [0.55, 0.45, 0.35, 0.6, 0.5, 0.4] to export several thresholds at once.
    plot.wrapper_eval_prediction_mono(
        pip_self=pl_ext,
        X=pl_ext.data.X_val,
        y_true=pl_ext.data.y_val,
        model=pl_ext.master_RFC,
        thresholds=[0.48],
        figsize=(15,10),
        font_size=22,
        export=True,
    )

print("Loop finished")

### Figures

In [None]:
pl_ext.user_input.fig_path

In [None]:
pl_ext.feature_imp_barplot(n_features=50)

In [None]:
pl_ext.data.y_val

In [None]:
pl_ext.shap_analysis(sample_size=10000, max_display=15, fig_size=(12, 6))

In [None]:
pl.data.z_val.status_cancerreg

In [None]:
pl_ext.roc_auc_test_train()

In [None]:
plot.save_colorbar(pip_self=pl, figsize=(0.5, 6), font_size=22)

In [None]:
plot.wrapper_eval_prediction_mono(pip_self=pl_ext,X=pl_ext.data.X_val,y_true=pl_ext.data.y_val,model=pl_ext.master_RFC,thresholds=[0.55, 0.45, 0.35, 0.6, 0.5, 0.4],figsize=(15,10), font_size=22, export=True)

In [None]:
plot.wrapper_eval_prediction_multi(pip_self=pl_ext,X=pl_ext.data.X_val,y_true=pl_ext.data.y_val,model=pl_ext.master_RFC,thresholds=[(0.4, 0.6), (0.35, 0.55)],incorp_threh_in_y_label=True,figsize=(13,5), n_rows=1, font_size=22, export=True)

In [None]:
plot.create_violin_plot(pl_ext,pl_ext.data.X_val,pl_ext.data.y_val,model=pl_ext.master_RFC,ohe=pl_ext.ohe, gap=-0.1, width=0.8, thresholds_choice=[0,.35,.55,1])

In [None]:
def plot_KM(pl,thresholds=(0.35, 0.5),color_dict=None, x_scale="y", y_scale="default", font_size="22"):
    """
    Plot Kaplan-Meier curves of the validation cohort, split by predicted risk group.

    Predicted probabilities from ``pl.master_RFC`` on the one-hot encoded
    ``pl.data.X_val`` are binned into 'Low Risk' / 'Medium Risk' / 'High Risk'
    using ``thresholds``. One curve per group is drawn, and the figure is saved
    as SVG under ``pl.user_input.fig_path``.

    Parameters:
    - pl: pipeline object providing ``data.z_val`` (columns ``date_of_diag``,
      ``Date of assessment``, ``status``), ``data.X_val``, ``ohe``,
      ``master_RFC`` and ``user_input`` (``DOI``, ``fig_path``, ``col_subset``,
      ``row_subset``).
    - thresholds: (low, high) probability cut-offs; values below ``low`` are
      'Low Risk', values from ``high`` upwards are 'High Risk'.
    - color_dict: mapping of risk group name to line colour. Defaults to
      green / yellow / red for low / medium / high risk.
    - x_scale: time unit of the x axis: 'y' (years), 'm' (months) or 'd' (days).
    - y_scale: 'default' to keep matplotlib's limits, otherwise the value is
      passed to ``plt.ylim``.
    - font_size: font size for axis labels, ticks and legend.

    Returns:
    - The KaplanMeierFitter used for plotting. The same fitter is refitted for
      each group, so it holds the fit of the last group only.

    Side effects:
    - ``pl.data.z_val`` is modified in place: missing ``date_of_diag`` values are
      replaced by the censoring date 2024-01-01, and the columns
      ``time_to_event_d/m/y``, ``pred_prob`` and ``risk_group`` are added.
      NOTE(review): confirm that later cells rely on this before switching to a copy.
    """
    import pandas as pd
    from lifelines import KaplanMeierFitter
    import matplotlib.pyplot as plt

    if color_dict is None:
        color_dict = {'Low Risk': 'green', 'Medium Risk': 'yellow', 'High Risk': 'red'}

    def kaplan_meier_analysis(time, event, group, group_labels=None, plot=True):
        """
        Fit and optionally plot one Kaplan-Meier curve per group.

        Parameters:
        - time: Array-like object containing the time to event or censoring.
        - event: Array-like object indicating whether an event occurred (1) or not (0).
        - group: Array-like object specifying the group each observation belongs to.
        - group_labels: Optional list of labels, in order of first appearance of each group.
        - plot: Boolean indicating whether to plot the survival curves (default is True).

        Returns:
        - kmf: KaplanMeierFitter object, fitted last on the final group.
        """
        df = pd.DataFrame({'time': list(time), 'event': list(event), 'group': list(group)})
        kmf = KaplanMeierFitter()

        # Group-wise analysis: the fitter is refitted and drawn once per group.
        for i, grp in enumerate(df['group'].unique()):
            data = df.loc[df['group'] == grp, :]
            kmf.fit(data['time'], event_observed=data['event'], label=group_labels[i] if group_labels else f'{grp}')
            if plot:
                kmf.plot(color=color_dict.get(grp, 'green'), alpha=0.7)

        if plot:
            if x_scale == 'y':
                plt.xlabel('Time [Years]', fontsize=font_size)
            elif x_scale == 'm':
                plt.xlabel('Time [Months]', fontsize=font_size)
            else:
                plt.xlabel('Time [Days]', fontsize=font_size)
            plt.ylabel(f'1 - Probability of {pl.user_input.DOI} [%]', fontsize=font_size)
            plt.yticks(fontsize=font_size)
            plt.xticks(fontsize=font_size)
            plt.title(f'Time to {pl.user_input.DOI} per risk group')
            plt.legend(frameon=False, fontsize=font_size, loc="lower left")
            if y_scale != 'default':
                plt.ylim(y_scale)

            plt.tight_layout()
            plt.show()
        return kmf

    def get_group(pred_prob, thresholds):
        """Map each probability to 'Low Risk', 'Medium Risk' or 'High Risk'."""
        low_cut, high_cut = thresholds[0], thresholds[1]
        groups = []
        for prob in pred_prob:
            if prob < low_cut:
                groups.append('Low Risk')
            elif prob < high_cut:
                groups.append('Medium Risk')
            elif prob >= high_cut:
                groups.append('High Risk')
            else:
                # Only reachable for NaN, which would otherwise be skipped silently
                # and make the result shorter than the input.
                raise ValueError(f"Cannot assign a risk group to probability {prob!r}")
        return groups

    # Participants without a diagnosis date are censored at this fixed date.
    time_censoring = pd.Timestamp(year=2024, day=1, month=1)
    z_val = pl.data.z_val

    z_val.loc[z_val.date_of_diag.isna(), 'date_of_diag'] = time_censoring
    elapsed = pd.to_datetime(z_val.date_of_diag) - pd.to_datetime(z_val['Date of assessment'])
    z_val['time_to_event_d'] = elapsed.dt.days
    z_val['time_to_event_m'] = z_val['time_to_event_d'] / 30
    z_val['time_to_event_y'] = z_val['time_to_event_d'] / 365.25
    z_val["pred_prob"] = pl.master_RFC.predict_proba(pl.ohe.transform(pl.data.X_val)).values

    z_val['risk_group'] = get_group(z_val.pred_prob, thresholds=thresholds)
    print('The Thresholds are:', thresholds)
    print(z_val.risk_group.value_counts())

    fig, ax = plt.subplots(figsize=(10, 10))
    time_column = f'time_to_event_{x_scale}'
    estimator = kaplan_meier_analysis(z_val[time_column], event=z_val.status, group=z_val.risk_group)
    estimator.plot_survival_function()

    svg_path = os.path.join(pl.user_input.fig_path, f"KaplanMeier_{pl.user_input.col_subset}_{pl.user_input.row_subset}_{y_scale}.svg")
    fig.savefig(svg_path, format='svg', bbox_inches='tight', transparent=True)

    return estimator

### Sub-Analysis

In [None]:
# Conf Matrices Male
plot.wrapper_eval_prediction_multi(pip_self=pl_ext,X=pl_ext.data.X_val,y_true=pl_ext.data.y_val,model=pl_ext.master_RFC,thresholds=[(0.4, 0.6), (0.35, 0.55)],incorp_threh_in_y_label=True,figsize=(13,5), n_rows=1, font_size=22, stratify={'column': 'SEX', 'value': 1})

In [None]:
# Conf Matrices Female
plot.wrapper_eval_prediction_multi(pip_self=pl_ext,X=pl_ext.data.X_val,y_true=pl_ext.data.y_val,model=pl_ext.master_RFC,thresholds=[(0.4, 0.6), (0.35, 0.55)],incorp_threh_in_y_label=True,figsize=(13,5), n_rows=1, font_size=22, stratify={'column': 'SEX', 'value': 0})

In [None]:
# Conf Matrices White
plot.wrapper_eval_prediction_multi(pip_self=pl_ext,X=pl_ext.data.X_val,y_true=pl_ext.data.y_val,model=pl_ext.master_RFC,thresholds=[(0.4, 0.6), (0.35, 0.55)],incorp_threh_in_y_label=True,figsize=(13,5), n_rows=1, font_size=22, stratify={'column': 'race_binary', 'value': "White"})

In [None]:
# Conf Matrices non White
plot.wrapper_eval_prediction_multi(pip_self=pl_ext,X=pl_ext.data.X_val,y_true=pl_ext.data.y_val,model=pl_ext.master_RFC,thresholds=[(0.4, 0.6), (0.35, 0.55)],incorp_threh_in_y_label=True,figsize=(13,5), n_rows=1, font_size=22, stratify={'column': 'race_binary', 'value': "Non-White"})

### Small info prints

In [None]:
plot_KM(pl, thresholds=(0.35, 0.6), x_scale='y', font_size=24)

In [None]:
plot_KM(pl, thresholds=(0.35, 0.55), x_scale='y', y_scale=(0,1))