# Violin and Precision-Recall Curve Analysis

This script provides functionality for generating violin plots and precision-recall (PR) curves for evaluating model performance. It includes customizable options for visualization, dataset summaries, and statistical comparisons.

# Initialization

In [None]:
import sys
sys.path.append("../../modeling_pipeline") # The project lives in a different folder (two levels up), so its path must be added to sys.path before importing from it
sys.path.append("../..")
from pipeline import * # Load our package with classes pipeline, models, pp (preprocessing), plot, and more
from wrapper_violins_prcs import *


# This allows us to automatically reload the packages we are working on in the background, no "Restart Kernel" needed
%load_ext autoreload
%autoreload 2


############### CHANGE THE PROJECT PATH ############
path= ("../../../../") # Choose your own project here, only works if you added specific project in user_settings.json
############### CHANGE THIS ############


legend_mgr = LegendDimensionManager() # Set up the legend manager for consistent legends across plots

fig_path = f"{path}/visuals"


# Load the colour/label configuration from the YAML file
yaml_colors_path = "custom_colors.yaml"
with open(yaml_colors_path, 'r') as file:
    config = yaml.safe_load(file)
scenarios_colors = config.get("scenarios_colors", {}) # Extract the color dictionary (empty dict if the key is missing)
print("Successfully loaded color dictionary with", len(scenarios_colors), "entries")

scenario_lists = config.get("scenario_lists", {}) # Extract the scenario lists (empty dict if the key is missing)
print("Successfully loaded scenario_lists with", len(scenario_lists), "entries")

title_dict = config.get("title_dict", {}) # Extract the title dictionary (empty dict if the key is missing)
print("Successfully loaded title_dict with", len(title_dict), "entries")

#scenarios_colors= config.get("scenarios_colors_hex", {}) # Extract the color dictionary


plot_colorbar(scenarios_colors)

In [None]:
# Display the colour dictionary loaded from the YAML configuration
scenarios_colors

#### Customization Options
- **Color Schemes:** Uses predefined or custom colors.
- **Figure Layout:** Adjustable grid layout (rows/columns).
- **Labeling:** Supports automatic and user-defined labels.
- **Plot Styles:** Various styles including line thickness, transparency, and split visualization.

Color Schemes:

The framework uses either your own pre-defined colors (`custom_colors.yaml`) or a default YAML file (`default_colors.yaml`) to define the color schemes for the different scenarios.

Title dict:

You can customize the label names displayed in the plots via the `title_dict` entry in `custom_colors.yaml`.

# Data Import

### Prediction Models
Load your prediction score data from our modelling pipeline. This takes some time to load (30 - 60s) as we need the raw prediction data of > 500.000 people...

If using the pipeline with independent data, make sure your files adhere to this format:

|    eid   | status | status_cancerreg |  y_pred   | SEX |
|----------|--------|------------------|-----------|-----|
| 1000000  |   0    |         0        |   0.33    |  0  |
| 1000000  |   0    |         0        | 0.247354  |  1  |
| 1000000  |   0    |         0        | 0.421570  |  0  |
| 1000100  |   0    |         0        | 0.360414  |  0  |
| ...      |        |                  |           |     |
| 9999999  |   0    |         0        | 0.373559  |  1  |


In [None]:
# Load the combined validation predictions relative to the configured project
# path (instead of a hard-coded relative path), so that changing `path` in the
# initialization cell is respected here as well.
# NOTE: the following cell loads the same file again via load_prediction_values
# and overwrites `dataframes`.
dataframes = joblib.load(os.path.join(path, "combined_output/val/Prediction_values_combined.joblib"))

In [None]:
# Location of the combined validation predictions inside the project folder
prediction_file = os.path.join(path, "combined_output/val/Prediction_values_combined.joblib")
dataframes = load_prediction_values(prediction_file, prefix_keys=False)

# List the keys that are now available
dataframes.keys()

In [None]:
# The dataframes dictionary is indexed by model name; display one entry for inspection
dataframes["par_Model_TOP15"]

### Covariates Data

Add SEX Info

In [None]:
# Read the covariate table and list the columns it provides
covariates_csv = os.path.join(path, "data/23_04_2025/df_all_outer_basic_all.csv")
df = pd.read_csv(covariates_csv)

df.columns

In [None]:
# Import sex info and merge it onto every dataframe for sex-stratified analysis.
# Only the two required columns are read, which avoids parsing the whole table.
# Alternative source: os.path.join(path, "data/dataframes/df_covariates.csv")
df_sex = pd.read_csv(
    os.path.join(path, "data/23_04_2025/df_all_outer_basic_all.csv"),
    usecols=['eid', 'SEX'],
)[['eid', 'SEX']]

for key in dataframes:
    # Drop a SEX column that is already present (e.g. when this cell is re-run);
    # otherwise the merge would suffix the clashing columns as SEX_x / SEX_y.
    without_sex = dataframes[key].drop(columns='SEX', errors='ignore')
    # pd.merge defaults to an inner join: rows whose eid is absent from df_sex are dropped
    dataframes[key] = pd.merge(without_sex, df_sex, on='eid')



In [None]:
# Debug cell: load the liver-cirrhosis flag per participant (eid).
# Only the two required columns are read; nothing is merged onto `dataframes` here.
df_cirrhosis = pd.read_csv(
    os.path.join(path, "data/23_04_2025/df_all_outer_basic_all.csv"),
    usecols=['eid', 'Liver cirrhosis'],
)[['eid', 'Liver cirrhosis']]


Optional: Add Cirrhosis Info --> Obsolete, use implementation via df_benchmarks

In [None]:
# df_diagnosis = pd.read_csv(path+'/data/dataframes/df_diagnosis_1y.csv')

# df_diagnosis = df_diagnosis[['eid', 'Liver cirrhosis']]

# df_diagnosis["Cirrhosis"] = df_diagnosis["Liver cirrhosis"]
# df_diagnosis.drop(columns="Liver cirrhosis")

# df_diagnosis

In [None]:
# #Import whatever other feature you want to have in the dataframes, e.g. for visualization as dots overlaying scatter plot
# df_diagnosis = pd.read_csv(path+'/data/dataframes/df_diagnosis_1y.csv')

# df_diagnosis = df_diagnosis[['eid', 'Liver cirrhosis']]

# df_diagnosis["Cirrhosis"] = df_diagnosis["Liver cirrhosis"]
# df_diagnosis.drop(columns="Liver cirrhosis")

# for key in dataframes:
#     dataframes[key] = pd.merge(dataframes[key], df_diagnosis, on='eid', how='left')

# # dataframes[key]["Liver cirrhosis"].fillna(0, inplace=True)
# # dataframes[key]["Liver cirrhosis"] = dataframes[key]["Liver cirrhosis"].astype(int)

### Create PAR subset from the All dictionary (PAR = patients at risk)

In [None]:
# Get the unique eids of the full cohort and of the patients-at-risk (PAR) cohort,
# taken from one model of each cohort.
# TODO: Create a smoother, more generalized version of the PAR cohort creation, ideally during preprocessing export
all_eids = set(dataframes["all_Model_TOP15"]["eid"])
par_eids = set(dataframes["par_Model_TOP15"]["eid"])

# Step 1: Create the dictionary with the eids of both cohorts
cohort_eids_dict = {
    "all": list(all_eids),
    "par": list(par_eids),
}

# Print summary
print(f"Cohort 'all': {len(all_eids)} patients")
print(f"Cohort 'par': {len(par_eids)} patients")
print(f"Overlap: {len(all_eids & par_eids)} patients")

# Step 2: Save the dictionary to a file; the log message reports the path that was actually written
cohort_json_path = os.path.join(path, "data/cohort_dict_test.json")
with open(cohort_json_path, 'w') as f:
    json.dump(cohort_eids_dict, f, indent=2)
print(f"Saved {cohort_json_path}")

# Step 3: Create a DataFrame with all unique eids from both sets
all_unique_eids = sorted(all_eids | par_eids)
df_subsets = pd.DataFrame({"eid": all_unique_eids})

# Mark membership: 1 if eid is in the respective set, else 0 (vectorised instead of a per-row lambda)
df_subsets["All"] = df_subsets["eid"].isin(all_eids).astype("int64")
df_subsets["PAR"] = df_subsets["eid"].isin(par_eids).astype("int64")

print(df_subsets.head())
print(df_subsets["All"].value_counts())
print(df_subsets["PAR"].value_counts())

# Save the DataFrame to a CSV file
df_subsets.to_csv(os.path.join(path, "data/dataframes/df_subsets.csv"), index=False)

### Subset eids for Proteomics

In [None]:
# Collect the eids for which proteomics is available, for later filtering
proteomics_csv = 'D:/OneDrive - Uniklinik RWTH Aachen/drive/public/ukb/extracted/proteomics_wide_instant_0.csv'
# Only the eid column is read; the dtype is given explicitly (optional, reduces memory)
df_proteomics_eid = pd.read_csv(proteomics_csv, usecols=['eid'], dtype={'eid': int})

eid_column = df_proteomics_eid['eid'].astype(int)
proteomics_eids = set(eid_column)

df_proteomics_eid.shape

### Benchmark Data

Choose literature benchmarks/other models and load them as df_benchmark with y_pred (the prediction scores), the status (ground truth) and ideally info on SEX for stratification analysis

In [None]:
benchmark_all = pd.read_csv(os.path.join(path, 'data/df_benchmark.csv'))

# Attach the cirrhosis flag (renamed to 'Cirrhosis') to the benchmark table via a left join on eid
covariates_csv = os.path.join(path, "data/23_04_2025/df_all_outer_basic_all.csv")
df_cirrhosis = pd.read_csv(covariates_csv)[['eid', 'Liver cirrhosis']]
df_cirrhosis = df_cirrhosis.rename(columns={'Liver cirrhosis': 'Cirrhosis'})
benchmark_all = benchmark_all.merge(df_cirrhosis, on='eid', how='left')


# Scores to process
#scores = ["aMAP", "APRI", "FIB4", "NFS", "LiverRisk", "AFP", "Cirrhosis"]
scores = ["aMAP", "APRI", "FIB4", "NFS", "Cirrhosis"]

# Register every score under the key 'all_<score>': the score column becomes
# 'y_pred' and only y_pred, status and eid are kept (SEX is not carried over).
for score in scores:
    df_temp = (
        benchmark_all
        .copy()
        .rename(columns={
            score: 'y_pred',
            #'gender' : 'SEX'
        })
    )[['y_pred', 'status', 'eid']]

    dataframes[f'all_{score}'] = df_temp

    print(f"all_{score} added to dataframes.")
print("All scores added to dataframes.")


# Derive the matching 'par_<score>' entries by restricting each 'all_<score>' frame to the PAR eids
for score in scores:
    all_key = f'all_{score}'
    par_key = f'par_{score}'
    if all_key not in dataframes:
        continue
    in_par = dataframes[all_key]["eid"].isin(par_eids)
    dataframes[par_key] = dataframes[all_key][in_par].copy()
    print(f"par_{score} added to dataframes.")


dataframes.keys()


In [None]:
columns_to_impute = ['aMAP', 'NFS', 'FIB4', 'APRI']

# Impute the specified columns of the benchmark table with their respective means.
# The table is named `benchmark_all` (the previously used name `benchmarks` is not
# defined in this notebook), and the result is assigned back explicitly instead of
# calling fillna(inplace=True) on a column selection, which is chained assignment.
# NOTE(review): the 'all_<score>' / 'par_<score>' entries in `dataframes` are built
# from copies of benchmark_all, so this imputation does not reach them unless the
# cell that builds them is run again afterwards.
column_means = benchmark_all[columns_to_impute].mean()
benchmark_all[columns_to_impute] = benchmark_all[columns_to_impute].fillna(column_means)

# Verify the imputation
print("NA counts after imputation:")
print(benchmark_all[columns_to_impute].isnull().sum())

# Optional: Display summary statistics of imputed columns
print("\nSummary of imputed columns:")
print(benchmark_all[columns_to_impute].describe())

### Proteomics dataframes

In [None]:
# Report the number of rows of every entry in `dataframes`
for key in dataframes:
    df = dataframes[key]
    n_rows = len(df)
    print(f"{key}: {n_rows} rows (from {key})")

In [None]:
# Report the number of rows of every proteomics-filtered dataframe.
# NOTE: `dataframes_proteomics` is created in the next cell, so that cell must be run before this one.
for key, df in dataframes_proteomics.items():
    print(f"{key}: {len(df)} rows (from {key})")

In [None]:
# Build a proteomics-restricted copy of every dataframe that has a 'y_pred' column.
# The keys are kept identical to those in `dataframes`.
dataframes_proteomics = {}
for key, df in dataframes.items():
    if 'y_pred' in df.columns:
        has_proteomics = df['eid'].isin(proteomics_eids)
        has_prediction = df['y_pred'].notna()
        dfp = df[has_proteomics & has_prediction].copy()
        new_key = f"{key}"
        dataframes_proteomics[new_key] = dfp
        print(f"{new_key}: {len(dfp)} rows (from {key})")

# optional: show the first keys that were created
print("Proteomics dataframes created:", list(dataframes_proteomics.keys())[:10])

### Double check dataframes

In [None]:
# Summarise all dataframes, sorted by row count in descending order
summary_df = summarize_dataframes(dataframes).sort_values('Rows', ascending=False)
print(summary_df.to_string(index=False))
#summary_df.to_csv('dataframes_summary.csv', index=False)

# Add reduced dataframes for proteomic data

In [None]:
# Display the complete dataframes dictionary for a visual check
dataframes

# Violin Plots

- Generates violin plots for visualizing the distribution of predicted probabilities.
- Supports:
  - Custom color schemes
  - Splitting by sex
  - Adjustable figure layout (rows/columns)
  - Custom labeling and styling
  - Saving plots in SVG format

### All

This combines both the incremental (A-E) and the separate models. It only works if you have trained all of these model constellations.

In [None]:

# Separate models first, then the incremental models A-E
keys_ordered_all = [
    'all_Model_Demographics',
    'all_Model_Diagnosis',
    'all_Model_Blood',
    'all_Model_SNP',
    'all_Model_Metabolomics',
    'all_Model_A',
    'all_Model_B',
    'all_Model_C',
    'all_Model_D',
    'all_Model_E',
]
colors = assign_colors(keys_ordered_all, scenarios_colors)

# n_cols equals the number of keys; no fig_path is passed in this cell
create_violin_plots(
    dataframes=dataframes,
    keys_ordered=keys_ordered_all,
    title_dict=title_dict,
    color_dict=colors,
    display="all",
    title_display="All",
    inner_detail="quart",
    n_cols=10,
    gap=0,
    truth="status",
)

##### All only incremental models (Figure 3a)

In [None]:
# Incremental models A-E plus the aMAP benchmark
keys_ordered_all = [
    'all_Model_A',
    'all_Model_B',
    'all_Model_C',
    'all_Model_D',
    'all_Model_E',
    'all_aMAP',
]
colors = assign_colors(keys_ordered_all, scenarios_colors)

violin_options = dict(
    color_dict=colors,
    title_dict=title_dict,
    display="all_inc",
    title_display="All",
    inner_detail="quart",
    n_cols=6,
    gap=0,
    split_by_sex=False,
    truth="status_cancerreg",
    fig_path=fig_path,
    highlight_column="Liver cirrhosis",
)
create_violin_plots(dataframes=dataframes, keys_ordered=keys_ordered_all, **violin_options)

##### All only separate models (Suppl. Figure 3g)

In [None]:
# The five separate models
keys_ordered_all = [
    'all_Model_Demographics',
    'all_Model_Diagnosis',
    'all_Model_Blood',
    'all_Model_SNP',
    'all_Model_Metabolomics',
]
colors = assign_colors(keys_ordered_all, scenarios_colors)

create_violin_plots(
    dataframes=dataframes,
    keys_ordered=keys_ordered_all,
    color_dict=colors,
    title_dict=title_dict,
    display="all_sep",
    title_display="All",
    inner_detail="quart",
    n_cols=5,
    gap=0,
    split_by_sex=False,
    truth="status_cancerreg",
    fig_path=fig_path,
)

##### All strat. sex

In [None]:
# Incremental models A-E, plotted with split_by_sex enabled
keys_ordered_all = [
    'all_Model_A',
    'all_Model_B',
    'all_Model_C',
    'all_Model_D',
    'all_Model_E',
]
colors = assign_colors(keys_ordered_all, scenarios_colors)

violin_options = dict(
    color_dict=colors,
    title_dict=title_dict,
    display="all_sex",
    title_display="All",
    inner_detail="quart",
    n_cols=5,
    gap=0,
    split_by_sex=True,
    truth="status_cancerreg",
    fig_path=fig_path,
)
create_violin_plots(dataframes=dataframes, keys_ordered=keys_ordered_all, **violin_options)

### PAR

In [None]:
# PAR cohort: separate models first, then the incremental models A-E
keys_ordered_par = [
    'par_Model_Demographics',
    'par_Model_Diagnosis',
    'par_Model_Blood',
    'par_Model_SNP',
    'par_Model_Metabolomics',
    'par_Model_A',
    'par_Model_B',
    'par_Model_C',
    'par_Model_D',
    'par_Model_E',
]
colors = assign_colors(keys_ordered_par, scenarios_colors)

create_violin_plots(
    dataframes=dataframes,
    keys_ordered=keys_ordered_par,
    color_dict=colors,
    title_dict=title_dict,
    title_display="Chronic Liver Disease",
    display="par",
    n_cols=10,
    gap=0,
    truth="status_cancerreg",
    fig_path=fig_path,
)

##### PAR only incremental (Figure 3d)

In [None]:
# PAR cohort: incremental models A-E only
keys_ordered_par = [
    'par_Model_A',
    'par_Model_B',
    'par_Model_C',
    'par_Model_D',
    'par_Model_E',
]
colors = assign_colors(keys_ordered_par, scenarios_colors)

violin_options = dict(
    color_dict=colors,
    title_dict=title_dict,
    display="par_inc",
    title_display="Chronic Liver Disease",
    n_cols=5,
    gap=0,
    split_by_sex=False,
    fig_path=fig_path,
    highlight_column="Liver cirrhosis",
)
create_violin_plots(dataframes=dataframes, keys_ordered=keys_ordered_par, **violin_options)

##### Par only separate

In [None]:
# PAR cohort: the five separate models
keys_ordered_par = [
    'par_Model_Demographics',
    'par_Model_Diagnosis',
    'par_Model_Blood',
    'par_Model_SNP',
    'par_Model_Metabolomics',
]

# Assign colours for the keys plotted in this cell (previously taken from the
# stale keys_ordered_all list left over from an earlier cell)
colors = assign_colors(keys_ordered_par, scenarios_colors)

create_violin_plots(
    dataframes=dataframes,
    keys_ordered=keys_ordered_par,
    color_dict=colors,
    title_dict=title_dict,
    display="par_sep",
    title_display="par",
    inner_detail="quart",
    n_cols=5,
    gap=0,
    split_by_sex=False,
    truth="status_cancerreg",
    fig_path=fig_path,
)

### PAR stratified by sex

In [None]:
# PAR cohort: incremental models A-E, plotted with split_by_sex enabled
keys_ordered_par_sex = [
    'par_Model_A',
    'par_Model_B',
    'par_Model_C',
    'par_Model_D',
    'par_Model_E',
]
colors = assign_colors(keys_ordered_par_sex, scenarios_colors)

create_violin_plots(
    dataframes=dataframes,
    keys_ordered=keys_ordered_par_sex,
    color_dict=colors,
    title_dict=title_dict,
    display="par_sex",
    title_display="Chronic Liver Disease",
    n_cols=5,
    gap=0,
    split_by_sex=True,
    fig_path=fig_path,
)

## All Reduced Models + Benchmarking Literature Scores (Suppl. Figure 3h)

In [None]:
# Full model C, the reduced models and the aMAP benchmark
keys_ordered_reduced = [
    'all_Model_C',
    'all_Model_TOP75',
    'all_Model_TOP30',
    'all_Model_TOP15',
    'all_Model_AMAP-RFC',
    'all_aMAP',
]

# Assign colours for the keys plotted in this cell (previously taken from the
# stale keys_ordered_all list left over from an earlier cell)
colors = assign_colors(keys_ordered_reduced, scenarios_colors)

create_violin_plots(
    dataframes=dataframes,
    keys_ordered=keys_ordered_reduced,
    color_dict=colors,
    title_dict=title_dict,
    display="small_benchmark_all",
    title_display="All",
    inner_detail="quart",
    n_cols=6,
    gap=0,
    truth="status_cancerreg",
    font_size=24,
    fig_path=fig_path,
    highlight_column="Liver cirrhosis",
)

## PAR Reduced Models

In [None]:
# PAR cohort: full model C, reduced models and the aMAP benchmark
keys_ordered_reduced = [
    'par_Model_C',
    'par_Model_TOP15',
    'par_Model_AMAP-RFC',
    'par_aMAP',
]

# Assign colours for the keys plotted in this cell (previously taken from the
# stale keys_ordered_all list left over from an earlier cell)
colors = assign_colors(keys_ordered_reduced, scenarios_colors)

create_violin_plots(
    dataframes=dataframes,
    keys_ordered=keys_ordered_reduced,
    color_dict=colors,
    title_dict=title_dict,
    display="small_benchmark_par",
    title_display="PAR",
    inner_detail="quart",
    n_cols=6,
    gap=0,
    truth="status_cancerreg",
    font_size=24,
    fig_path=fig_path,
    highlight_column="Liver cirrhosis",
)

## Benchmarking Scores

In [None]:
# WORK IN PROGRESS

# Literature benchmark scores. The cirrhosis entry is stored under
# 'all_Cirrhosis' (the column is renamed when the benchmark dataframes are
# built), so the former key 'all_Liver cirrhosis' could not be found.
keys_ordered_reduced = ['all_aMAP', 'all_APRI', 'all_FIB4', 'all_NFS', 'all_Cirrhosis']

# Assign colours for the keys plotted in this cell (previously taken from the
# stale keys_ordered_all list left over from an earlier cell)
colors = assign_colors(keys_ordered_reduced, scenarios_colors)

# NOTE(review): the benchmark dataframes keep only y_pred, status and eid —
# confirm that create_violin_plots can handle truth="status_cancerreg" and
# highlight_column="Liver cirrhosis" for these entries.
create_violin_plots(
    dataframes=dataframes,
    keys_ordered=keys_ordered_reduced,
    color_dict=colors,
    title_dict=title_dict,
    display="small_benchmark_all",
    title_display="All",
    inner_detail="quart",
    n_cols=6,
    gap=0,
    truth="status_cancerreg",
    font_size=24,
    fig_path=fig_path,
    highlight_column="Liver cirrhosis",
)

##### Split by sex

In [None]:
# Reduced models, the aMAP benchmark and model D, plotted with split_by_sex enabled.
# The last key is spelled 'all_Model_D' like every other model key in this
# notebook (it was "all_model_D", which does not match that spelling).
keys_ordered_reduced = [
    'all_Model_C',
    'all_Model_TOP75',
    'all_Model_TOP30',
    'all_Model_TOP15',
    'all_Model_AMAP-RFC',
    'all_aMAP',
    'all_Model_D',
]

# Assign colours for the keys plotted in this cell (previously taken from the
# stale keys_ordered_all list left over from an earlier cell)
colors = assign_colors(keys_ordered_reduced, scenarios_colors)

# NOTE(review): seven keys are plotted with n_cols=6, and the 'all_aMAP'
# dataframe is built without a SEX column — confirm both are intended.
create_violin_plots(
    dataframes=dataframes,
    keys_ordered=keys_ordered_reduced,
    color_dict=colors,
    title_dict=title_dict,
    display="small_benchmark_all_strat",
    title_display="All",
    inner_detail="quart",
    n_cols=6,
    gap=0,
    truth="status_cancerreg",
    font_size=24,
    split_by_sex=True,
    fig_path=fig_path,
)

##### Violins small models PAR

In [None]:
# PAR cohort: full model C, the reduced models and the aMAP benchmark
keys_ordered_reduced = [
    'par_Model_C',
    'par_Model_TOP75',
    'par_Model_TOP30',
    'par_Model_TOP15',
    'par_Model_AMAP-RFC',
    'par_aMAP',
]

# Assign colours for the keys plotted in this cell (previously taken from the
# stale keys_ordered_all list left over from an earlier cell)
colors = assign_colors(keys_ordered_reduced, scenarios_colors)

create_violin_plots(
    dataframes=dataframes,
    keys_ordered=keys_ordered_reduced,
    color_dict=colors,
    title_dict=title_dict,
    display="small_benchmark_par",
    title_display="Patients at risk",
    inner_detail="quart",
    n_cols=6,
    gap=0,
    truth="status_cancerreg",
    fig_path=fig_path,
)

# Precision Recall Curve Visualizations

- Computes and visualizes Precision recall curves (PRCs) for multiple datasets and displays the area under the PRC
- Allows:
  - Overlaying multiple PR curves
  - Filling between standard deviation bounds
  - Customizing colors, line styles, and plot dimensions
  - Saving plots as SVG

### All (Figure 3c)

In [None]:
# Models to plot: incremental models A-E followed by the literature benchmarks
keys_ordered_all = [
    'all_Model_A',
    'all_Model_B',
    'all_Model_C',
    'all_Model_D',
    'all_Model_E',
    'all_aMAP',
    'all_APRI',
    'all_FIB4',
    'all_NFS',
    'all_Cirrhosis',
]
# The benchmark keys are passed as dotted_keys
dotted_keys_list = ['all_aMAP', 'all_APRI', 'all_FIB4', 'all_NFS', 'all_Cirrhosis']

# Pass colors stored in scenarios_colors, looks first for full match, then for partial match
colors = assign_colors(keys_ordered_all, scenarios_colors)

fig, ax = plt.subplots()

prc_options = dict(
    legend_manager=legend_mgr,
    main_legend_dims=['model_type', 'biomarker'],
    show_main_legend=True,
    show_secondary_legend=False,
    dotted_keys=dotted_keys_list,
    display= "All",
    fill_bet=False,
    title=None,
    fig_path=fig_path,
    plot_legend=True,
    lw=3,
    font_size=20,
    truth="status",
)
plot_precision_recall_curves(dataframes, keys_ordered_all, colors, fig, ax, **prc_options)

### PAR (Figure 3f)

In [None]:
# Models to plot: PAR incremental models A-E followed by the literature benchmarks
#keys_ordered_par=['par_Model_A', 'par_Model_B', 'par_Model_C','par_Model_D', 'par_Model_E', 'par_aMAP', 'par_APRI', 'par_FIB4', 'par_NFS', 'par_LiverRisk', 'par_Cirrhosis']
keys_ordered_par = [
    'par_Model_A',
    'par_Model_B',
    'par_Model_C',
    'par_Model_D',
    'par_Model_E',
    'par_aMAP',
    'par_APRI',
    'par_FIB4',
    'par_NFS',
    'par_Cirrhosis',
]
# The benchmark keys are passed as dotted_keys
dotted_keys_list = ['par_aMAP', 'par_APRI', 'par_FIB4', 'par_NFS', 'par_Cirrhosis']

# Pass colors stored in scenarios_colors, looks first for full match, then for partial match
colors = assign_colors(keys_ordered_par, scenarios_colors)
fig, ax = plt.subplots()

prc_options = dict(
    legend_manager=legend_mgr,
    main_legend_dims=['model_type', 'biomarker'],
    show_main_legend=True,
    show_secondary_legend=False,
    dotted_keys=dotted_keys_list,
    display= "Patients_at_risk",
    fill_bet=False,
    title=None,
    fig_path=fig_path,
    plot_legend=True,
    lw=3,
    font_size=20,
    ylim=(0,1),
)
plot_precision_recall_curves(dataframes, keys_ordered_par, colors, fig, ax, **prc_options)


## Reduced Models

#### All (Figure 3i)

Full axis

In [None]:
# TODO adapt to new PRC function

# Full model C, the reduced models and the aMAP benchmark
keys_ordered_reduced = [
    'all_Model_C',
    'all_Model_TOP75',
    'all_Model_TOP30',
    'all_Model_TOP15',
    'all_Model_AMAP-RFC',
    'all_aMAP',
]

colors = assign_colors(keys_ordered_reduced, scenarios_colors)

fig_all, ax_all = plt.subplots()

# dotted_keys is passed as a list with the exact key. The previous value
# ("all_amap") was a plain string (parentheses alone do not create a tuple)
# whose capitalisation did not match the 'all_aMAP' key.
plot_precision_recall_curves(
    dataframes,
    keys_ordered_reduced,
    colors=colors,
    fig=fig_all,
    ax=ax_all,
    ylim=(0, 1),
    display= "All - Reduced_models",
    fill_bet=False,
    title='Precision-Recall Curves',
    fig_path=fig_path,
    dotted_keys=["all_aMAP"],
    plot_legend=True,
    lw=2,
    font_size=12,
)
plt.show()

Closer view

In [None]:
# Full model C, the reduced models and the aMAP benchmark
keys_ordered_reduced = [
    'all_Model_C',
    'all_Model_TOP75',
    'all_Model_TOP30',
    'all_Model_TOP15',
    'all_Model_AMAP-RFC',
    'all_aMAP',
]

colors = assign_colors(keys_ordered_reduced, scenarios_colors)

fig_all, ax_all = plt.subplots()

# Closer view: y-axis limited to 0-0.4. dotted_keys is passed as a list; the
# previous ("all_aMAP") was a plain string, since parentheses alone do not
# create a tuple.
plot_precision_recall_curves(
    dataframes,
    keys_ordered_reduced,
    colors,
    fig=fig_all,
    ax=ax_all,
    ylim=(0, 0.4),
    display= "All - Reduced_models",
    fill_bet=False,
    title='Precision-Recall Curves',
    fig_path=fig_path,
    dotted_keys=["all_aMAP"],
    plot_legend=False,
    lw=3,
    font_size=16,
)
plt.show()

#### PAR (Figure 3l)
Full axis

In [None]:
# PAR cohort: TOP15 model against the aMAP benchmark
keys_ordered_reduced = ['par_Model_TOP15', 'par_aMAP']

colors = assign_colors(keys_ordered_reduced, scenarios_colors)

fig_par, ax_par = plt.subplots()

# dotted_keys is passed as a list; the previous ("par_aMAP") was a plain
# string, since parentheses alone do not create a tuple.
plot_precision_recall_curves(
    dataframes,
    keys_ordered_reduced,
    colors,
    fig=fig_par,
    ax=ax_par,
    ylim=(0, 1),
    display= "Reduced Models - PAR",
    fill_bet=False,
    title='PAR_Reduced Models',
    truth="status",
    fig_path=fig_path,
    dotted_keys=["par_aMAP"],
    plot_legend=True,
    lw=2,
    font_size=12,
)
plt.show()

Closer view

In [None]:
# PAR cohort: full model C, the reduced models and the aMAP benchmark
keys_ordered_reduced = [
    'par_Model_C',
    'par_Model_TOP75',
    'par_Model_TOP30',
    'par_Model_TOP15',
    'par_Model_AMAP-RFC',
    'par_aMAP',
]

colors = assign_colors(keys_ordered_reduced, scenarios_colors)

fig_par, ax_par = plt.subplots()

# Closer view: y-axis limited to 0-0.4. dotted_keys is passed as a list; the
# previous ("par_aMAP") was a plain string, since parentheses alone do not
# create a tuple.
plot_precision_recall_curves(
    dataframes,
    keys_ordered_reduced,
    colors,
    fig=fig_par,
    ax=ax_par,
    ylim=(0, 0.4),
    display= "Reduced Models - PAR",
    fill_bet=False,
    title='PAR_Reduced_PRC',
    fig_path=fig_path,
    dotted_keys=["par_aMAP"],
    plot_legend=False,
    lw=3,
    font_size=16,
)
plt.show()

### Literature benchmarks PRC All

In [None]:

# Literature benchmark scores only
keys_ordered_all = ['all_APRI', 'all_NFS', 'all_FIB4', 'all_Cirrhosis', 'all_aMAP']

colors = assign_colors(keys_ordered_all, scenarios_colors)

fig_all, ax_all = plt.subplots()

# dotted_keys is passed as a list; the previous ("all_aMAP") was a plain
# string, since parentheses alone do not create a tuple.
plot_precision_recall_curves(
    dataframes,
    keys_ordered_all,
    colors,
    truth="status",
    fig=fig_all,
    ax=ax_all,
    ylim=(0, 1),
    display= "Literature Benchmarks",
    fill_bet=False,
    title='Precision-Recall Curves',
    fig_path=fig_path,
    dotted_keys=["all_aMAP"],
    font_size=12,
    lw=2,
    split_by_sex=False,
)
plt.show()

## PRC AOU Ext Val

### All

In [None]:
# Display the colour mapping produced by the most recently executed assign_colors call
colors

In [None]:
# Reduced models and literature benchmarks for the full cohort
keys_ordered_reduced = [
    'all_Model_TOP30',
    'all_Model_TOP15',
    'all_aMAP',
    'all_APRI',
    'all_NFS',
    'all_FIB4',
]
colors = assign_colors(keys_ordered_reduced, scenarios_colors)

fig, ax = plt.subplots()

# Full y-axis range (0-1)
prc_options = dict(
    ylim=(0, 1),
    truth='status',
    display= "All - Reduced_models",
    fill_bet=False,
    legend_manager=legend_mgr,
    main_legend_dims=['model_type', 'biomarker'],
    show_main_legend=True,
    show_secondary_legend=False,
    plot_legend=True,
    title='Precision-Recall Curves - All',
    fig_path=fig_path,
    lw=3,
    font_size=21,
    score_decimals=3,
)
plot_precision_recall_curves(dataframes, keys_ordered_reduced, colors, fig, ax, **prc_options)

plt.show()

In [None]:
# Reduced models and literature benchmarks for the full cohort
keys_ordered_reduced = [
    'all_Model_TOP30',
    'all_Model_TOP15',
    'all_aMAP',
    'all_APRI',
    'all_NFS',
    'all_FIB4',
]
colors = assign_colors(keys_ordered_reduced, scenarios_colors)

fig, ax = plt.subplots()

# Closer view: y-axis limited to 0-0.2, with thicker lines and a larger font
prc_options = dict(
    ylim=(0, 0.2),
    truth='status',
    display= "All - Reduced_models",
    fill_bet=False,
    legend_manager=legend_mgr,
    main_legend_dims=['model_type', 'biomarker'],
    show_main_legend=True,
    show_secondary_legend=False,
    plot_legend=True,
    title='Precision-Recall Curves - All',
    fig_path=fig_path,
    lw=4,
    font_size=24,
    score_decimals=3,
)
plot_precision_recall_curves(dataframes, keys_ordered_reduced, colors, fig, ax, **prc_options)

plt.show()

In [None]:
# TOP15 model and literature benchmarks for the PAR cohort
keys_ordered_reduced = [
    'par_Model_TOP15',
    'par_aMAP',
    'par_APRI',
    'par_NFS',
    'par_FIB4',
]
colors = assign_colors(keys_ordered_reduced, scenarios_colors)

fig, ax = plt.subplots()

# Full y-axis range (0-1)
prc_options = dict(
    ylim=(0, 1),
    truth='status',
    display= "PAR - Reduced_models",
    fill_bet=False,
    legend_manager=legend_mgr,
    main_legend_dims=['model_type', 'biomarker'],
    show_main_legend=True,
    show_secondary_legend=False,
    plot_legend=True,
    title='Precision-Recall Curves - PAR',
    fig_path=fig_path,
    lw=3,
    font_size=21,
    score_decimals=3,
)
plot_precision_recall_curves(dataframes, keys_ordered_reduced, colors, fig, ax, **prc_options)

plt.show()

In [None]:
# TOP15 model and literature benchmarks for the PAR cohort
keys_ordered_reduced = [
    'par_Model_TOP15',
    'par_aMAP',
    'par_APRI',
    'par_NFS',
    'par_FIB4',
]
colors = assign_colors(keys_ordered_reduced, scenarios_colors)

fig, ax = plt.subplots()

# Closer view: y-axis limited to 0-0.2, with thicker lines and a larger font
prc_options = dict(
    ylim=(0, 0.2),
    truth='status',
    display= "PAR - Reduced_models",
    fill_bet=False,
    legend_manager=legend_mgr,
    main_legend_dims=['model_type', 'biomarker'],
    show_main_legend=True,
    show_secondary_legend=False,
    plot_legend=True,
    title='Precision-Recall Curves - PAR',
    fig_path=fig_path,
    lw=4,
    font_size=24,
    score_decimals=3,
)
plot_precision_recall_curves(dataframes, keys_ordered_reduced, colors, fig, ax, **prc_options)

plt.show()

## AFP Benchmark

### PRC All (Suppl. Figure 5b)

In [None]:
# Models and benchmarks compared on the proteomics subset
keys_ordered = [
    'all_Model_C',
    'all_Model_TOP15',
    'all_aMAP',
    'all_AFP',
    'all_Cirrhosis',
]
colors = assign_colors(keys_ordered, scenarios_colors)

fig, ax = plt.subplots()

# Configuration 1: main legend built from model types and biomarkers, no secondary legend
prc_options = dict(
    legend_manager=legend_mgr,
    main_legend_dims=['model_type', 'biomarker'],
    show_main_legend=True,
    show_secondary_legend=False,
    display= "PRC AFP All Proteomics",
    fill_bet=False,
    title='Precision-Recall Curves - All (Proteomics Subset)',
    fig_path=fig_path,
    plot_legend=True,
    lw=3,
    font_size=21,
)
plot_precision_recall_curves(dataframes_proteomics, keys_ordered, colors, fig, ax, **prc_options)




### PRC PAR (Suppl. Figure 5b)

In [None]:
# PAR models and benchmarks compared on the proteomics subset
keys_ordered = [
    'par_Model_C',
    'par_Model_TOP15',
    'par_aMAP',
    'par_AFP',
    'par_Cirrhosis',
]
colors = assign_colors(keys_ordered, scenarios_colors)
fig, ax = plt.subplots()

# Configuration 1: main legend built from model types and biomarkers, no secondary legend
prc_options = dict(
    legend_manager=legend_mgr,
    main_legend_dims=['model_type', 'biomarker'],
    show_main_legend=True,
    show_secondary_legend=False,
    display= "PRC AFP PAR Proteomics",
    fill_bet=False,
    title='Precision-Recall Curves - PAR (Proteomics Subset)',
    fig_path=fig_path,
    plot_legend=True,
    lw=3,
    font_size=21,
)
plot_precision_recall_curves(dataframes_proteomics, keys_ordered, colors, fig, ax, **prc_options)




In [None]:
# Debug helper: print the dimensions a fresh LegendDimensionManager extracts from each key
legend_mgr = LegendDimensionManager()

keys_ordered = [
    'all_Model_C',
    'all_Model_TOP15',
    'all_aMAP',
    'all_AFP',
]

for key in keys_ordered:
    print(f"Key: {key} -> Dimensions: {legend_mgr.extract_dimensions(key)}")

# Venn Diagrams to understand subsetting

### Load df_y (target) and df_par(subsetting)

In [None]:
# Load the target table (restricted to the columns needed below) and the cohort membership table.
# No row filtering happens here; df_y_val still contains every row of df_y.csv.
y_columns = ['eid', 'status', 'status_cancerreg', 'split_ext']
df_y_val = pd.read_csv(f"{path}/data/dataframes/df_y.csv", usecols=y_columns)
df_par = pd.read_csv(f"{path}/data/dataframes/df_subsets.csv")

# Only the last expression of a cell is displayed
df_y_val
df_par

Load proteomics

In [None]:
# Load the proteomics eids again for reference
proteomics_csv = 'D:/OneDrive - Uniklinik RWTH Aachen/drive/public/ukb/extracted/proteomics_wide_instant_0.csv'
# The dtype is given explicitly (optional, reduces memory)
df_proteomics_eid = pd.read_csv(proteomics_csv, usecols=['eid'], dtype={'eid': int})

# Flag every listed eid with Proteomics == 1 as preparation for the Venn diagram
df_proteomics_eid["Proteomics"] = 1

df_proteomics_eid


In [None]:
# Outer-join cohort membership, targets and the proteomics flag on eid
df_venn = (
    df_par
    .merge(df_y_val, on='eid', how='outer')
    .merge(df_proteomics_eid, on='eid', how='outer')
)

df_venn.columns

# Show the rows whose split_ext flag is set
is_split_ext = df_venn['split_ext'] == True
df_venn[is_split_ext]

### Define Venn Diagram

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib_venn import venn3, venn3_circles
import pandas as pd
import numpy as np

def create_hcc_venn_diagram(df_venn, title="HCC Cohort Analysis", figsize=(12, 10),
                           label_fontsize=12, title_fontsize=16, summary_text=False, path=path,
                           save_path=None):
    """
    Draw a three-set Venn diagram of the Validation, PAR and Proteomics cohorts.

    Every region of the diagram is labelled with the number of rows it contains
    and the number of HCC cases (sum of the ``status`` column) among them. The
    figure is written to disk and a per-region summary is printed.

    Parameters:
    -----------
    df_venn : pd.DataFrame
        DataFrame with the columns read by this function: ``eid``,
        ``split_ext``, ``PAR``, ``Proteomics``, ``status`` and
        ``status_cancerreg``. A row belongs to a cohort when the respective
        flag column (``split_ext`` / ``PAR`` / ``Proteomics``) equals 1.
    title : str or None
        Title for the plot
    figsize : tuple
        Figure size (width, height)
    label_fontsize : int
        Font size for the labels inside circles; the set labels use
        ``label_fontsize + 2``
    title_fontsize : int
        Font size for the main title
    summary_text : bool
        If True, a text box with overall cohort sizes is added to the axes
    path : str
        Project root; used to build the default output location
        ``<path>/visuals/Venn_Overlap_Cohorts.svg``
    save_path : str, optional
        Explicit output file. When None (default) the location derived from
        ``path`` is used.

    Returns:
    --------
    fig, ax, region_stats
        The matplotlib figure and axis, plus a dict mapping each region name
        to ``{'total', 'hcc_overall', 'hcc_cancerreg'}`` counts.
    """
    # Local import so the function does not depend on `os` being provided by
    # a star-import elsewhere in the notebook.
    import os

    def eids_where(flag_column):
        """Return the set of eids whose ``flag_column`` equals 1."""
        return set(df_venn.loc[df_venn[flag_column] == 1, 'eid'])

    # Define the three sets
    validation_set = eids_where('split_ext')
    par_set = eids_where('PAR')
    proteomics_set = eids_where('Proteomics')

    # All seven disjoint regions of a three-set Venn diagram
    regions = {
        'validation_only': validation_set - par_set - proteomics_set,
        'par_only': par_set - validation_set - proteomics_set,
        'proteomics_only': proteomics_set - validation_set - par_set,
        'validation_par': (validation_set & par_set) - proteomics_set,
        'validation_proteomics': (validation_set & proteomics_set) - par_set,
        'par_proteomics': (par_set & proteomics_set) - validation_set,
        'all_three': validation_set & par_set & proteomics_set
    }

    def get_counts(eid_set):
        """Return (row count, HCC count, cancer-registry HCC count) for the eids."""
        if not eid_set:
            return 0, 0, 0

        subset_df = df_venn[df_venn['eid'].isin(eid_set)]
        # The status columns may be float (NaN for eids missing from the
        # target table after an outer merge). The sum skips NaN but is then a
        # float, which would render as e.g. "HCC = 12.0"; cast to int.
        return (
            len(subset_df),
            int(subset_df['status'].sum()),
            int(subset_df['status_cancerreg'].sum()),
        )

    # Calculate counts for each region
    region_stats = {}
    for region_name, eid_set in regions.items():
        total, hcc_overall, hcc_cancerreg = get_counts(eid_set)
        region_stats[region_name] = {
            'total': total,
            'hcc_overall': hcc_overall,
            'hcc_cancerreg': hcc_cancerreg
        }

    # (venn3 patch id, region name, fill colour), listed in the order venn3
    # expects its `subsets` tuple: A_only, B_only, AB, C_only, AC, BC, ABC
    # with A = Validation, B = PAR, C = Proteomics.
    venn_layout = [
        ('100', 'validation_only', '#d0a79a'),        # muted rose
        ('010', 'par_only', '#C13617'),               # brick red
        ('110', 'validation_par', '#FFCC99'),         # light orange
        ('001', 'proteomics_only', '#99FF99'),        # light green
        ('101', 'validation_proteomics', '#FFB3FF'),  # light purple
        ('011', 'par_proteomics', '#B3FFCC'),         # light mint
        ('111', 'all_three', '#FFFFB3'),              # light yellow
    ]
    venn_sizes = tuple(region_stats[name]['total'] for _, name, _ in venn_layout)

    # Create the plot
    fig, ax = plt.subplots(figsize=figsize)
    venn = venn3(subsets=venn_sizes,
                 set_labels=('Validation Cohort', 'PAR Cohort', 'Proteomics Cohort'),
                 ax=ax, alpha=0.7)

    # Add circles for better definition
    venn3_circles(subsets=venn_sizes, ax=ax, linewidth=0.5, color='black')

    for patch_id, region_name, fill_color in venn_layout:
        # Patches/labels can be missing (None), hence the guards.
        patch = venn.get_patch_by_id(patch_id)
        if patch is not None:
            patch.set_color(fill_color)

        label_obj = venn.get_label_by_id(patch_id)
        stats = region_stats[region_name]
        if label_obj is not None and stats['total'] > 0:
            # Multi-line label with total and HCC counts
            label_obj.set_text(f"n = {stats['total']:,}\nHCC = {stats['hcc_overall']}")
            label_obj.set_fontsize(label_fontsize)

    # Customize set labels
    for label in venn.set_labels:
        if label is not None:
            label.set_fontsize(label_fontsize + 2)

    # Add title
    ax.set_title(title, fontsize=title_fontsize, pad=20)

    # Optional summary box with overall cohort sizes
    if summary_text:
        total_validation = len(validation_set)
        total_par = len(par_set)
        total_proteomics = len(proteomics_set)
        total_overall = len(df_venn)
        total_hcc = int(df_venn['status'].sum())  # int for the same reason as above

        # Separate name so the boolean `summary_text` flag is not shadowed
        summary_body = f"""Summary Statistics:
    Total Cohort: {total_overall:,} patients
    • Validation Cohort: {total_validation:,} patients
    • PAR Cohort: {total_par:,} patients
    • Proteomics Cohort: {total_proteomics:,} patients
    Total HCC Cases: {total_hcc:,}"""

        # Position summary box
        ax.text(-0.6, -0.7, summary_body, transform=ax.transAxes, fontsize=10,
                bbox=dict(boxstyle="round,pad=0.5", facecolor="lightgray", alpha=0.8),
                verticalalignment='top')

    # Clean up the plot
    ax.set_xlim(-0.8, 0.8)
    ax.set_ylim(-0.8, 0.8)

    plt.tight_layout()

    # Fall back to the project default location when no explicit file is given
    if save_path is None:
        save_path = os.path.join(path, "visuals/Venn_Overlap_Cohorts.svg")

    # Save through the figure object rather than pyplot's "current figure"
    fig.savefig(save_path, dpi=300, bbox_inches='tight', transparent=True)
    print(f"Figure saved to: {save_path}")

    # Print detailed statistics
    print("\n" + "="*60)
    print("DETAILED VENN DIAGRAM STATISTICS")
    print("="*60)

    region_names = {
        'validation_only': 'Validation Only',
        'par_only': 'PAR Only',
        'proteomics_only': 'Proteomics Only',
        'validation_par': 'Validation + PAR',
        'validation_proteomics': 'Validation + Proteomics',
        'par_proteomics': 'PAR + Proteomics',
        'all_three': 'All Three Cohorts'
    }

    for region_key, region_display in region_names.items():
        stats = region_stats[region_key]
        if stats['total'] == 0:
            continue  # empty regions are not reported

        hcc_rate = (stats['hcc_overall'] / stats['total']) * 100
        print(f"\n{region_display}:")
        print(f"  Total: {stats['total']:,} patients")
        print(f"  HCC (overall): {stats['hcc_overall']:,} ({hcc_rate:.2f}%)")
        # The registry count is only shown when it differs from the overall count
        if stats['hcc_cancerreg'] != stats['hcc_overall']:
            cr_rate = (stats['hcc_cancerreg'] / stats['total']) * 100
            print(f"  HCC (cancer registry): {stats['hcc_cancerreg']:,} ({cr_rate:.2f}%)")

    return fig, ax, region_stats

# Example usage:
# fig, ax, stats = create_hcc_venn_diagram(
#     df_venn,
#     title="HCC Risk Prediction Cohorts Analysis",
#     figsize=(14, 10),
#     label_fontsize=11,
#     save_path="hcc_venn_diagram.png"
# )

In [None]:
# Rebuild the three cohort eid sets at notebook level for ad-hoc inspection
validation_set = set(df_venn.loc[df_venn['split_ext'] == 1, 'eid'])
par_set = set(df_venn.loc[df_venn['PAR'] == 1, 'eid'])
proteomics_set = set(df_venn.loc[df_venn['Proteomics'] == 1, 'eid'])


# eids that are in the validation split but in neither of the other two sets
validation_set.difference(par_set, proteomics_set)

In [None]:
# Draw the cohort overlap diagram without a title and with enlarged region
# labels; the function returns the figure, the axis and the per-region counts.
fig, ax, stats = create_hcc_venn_diagram(
    df_venn,
    title=None,
    figsize=(14, 10),
    label_fontsize=24,
    path=path
)