In [1]:
#### ------------------------------------------------------------------------------------------
#### author: Ranjan Barman, date: June 22, 2025
#### Mapped POST_NAT HoverNet NPIFs to BRCA subtypes status (using top 25  tiles)
#### Includes patient ID trace and unmatched checks (cleaned version)
#### ------------------------------------------------------------------------------------------

import os
import pandas as pd

# Set working directory
_wpath_ = "/data/Lab_ruppin/Ranjan/HnE/"
os.chdir(_wpath_)
print(f"Working directory: {_wpath_}\n")

# Dataset name
dataset_name = "POST_NAT_BRCA"

# File paths
npif_file = f"{dataset_name}/HoverNet/outputs/POST_NAT_BRCA_HoverNet_NPIFs_Filtered_Tiles_Top25Q.csv"
slide_list_file = "/data/Ruppin_AI/Datasets/Post_NAT_BRCA/processed/Post_NAT_BRCA_slide_list.tsv"
clinical_metadata_file = "/data/Ruppin_AI/Datasets/Post_NAT_BRCA/processed/Post_NAT_BRCA_clinical_metadata_short.tsv"

# Load NPIFs and extract Slide_ID
npif_df = pd.read_csv(npif_file)
npif_df["Slide_ID"] = npif_df["Slide_ID"].astype(int)

# Load slide-to-patient mapping and clinical metadata
slide_list_df = pd.read_csv(slide_list_file, sep="\t")
clinical_metadata_df = pd.read_csv(clinical_metadata_file, sep="\t")
slide_list_df
clinical_metadata_df

Working directory: /data/Lab_ruppin/Ranjan/HnE/



Unnamed: 0,Patient_ID,Age,Menopausal_status,Lymphovascular_invasion,Histology_type,Histology_grade,HER2_status,ER_status,PR_status,Clinical_subtype,Clinical_subtype_fine,NAT_regimen,Surgery_type_breast,Surgery_type_LN,Response
0,P1,67,Post,1.0,IDC,3,0.0,1.0,1.0,HR+,HR+,Chemo+Endocrine,Total mastectomy,Axillary LN dissection,PDR
1,P2,30,Pre,1.0,IDC,3,1.0,0.0,0.0,HER2+,HER2+,Chemo+Anti-HER2,Total mastectomy,Axillary LN dissection,PDR
2,P3,65,Post,1.0,IDC,2,0.0,1.0,1.0,HR+,HR+,Chemo,Partial mastectomy/lumpectomy,Axillary LN dissection,PDR
3,P4,41,Pre,0.0,IDC,2,0.0,1.0,1.0,HR+,HR+,Chemo,Total mastectomy,Sentinel LN biopsy,NDR
4,P5,47,Pre,0.0,IDC,Undetermined,1.0,1.0,1.0,HER2+,TPBC,Chemo+Anti-HER2,Total mastectomy,Axillary LN dissection,PDR
5,P6,58,Post,1.0,ILC,2,0.0,1.0,1.0,HR+,HR+,Chemo,Total mastectomy,Axillary LN dissection,PDR
6,P7,39,Pre,0.0,IDC,3,0.0,0.0,0.0,TNBC,TNBC,Chemo,Total mastectomy,Sentinel LN biopsy,PDR
7,P8,52,Pre,0.0,IDC,2,0.0,1.0,1.0,HR+,HR+,Chemo,Total mastectomy,Axillary LN dissection,PDR
8,P9,34,Pre,0.0,IDC,2,1.0,1.0,0.0,HER2+,TPBC,Chemo+Anti-HER2,Total mastectomy,Axillary LN dissection,PDR
9,P10,55,Post,1.0,IDC,2,0.0,1.0,1.0,HR+,HR+,Chemo,Total mastectomy,Axillary LN dissection,PDR


In [2]:
# Merge Slide_ID to get Patient_ID
npif_mapped_df = pd.merge(npif_df, slide_list_df[["Patient_ID", "Slide_ID"]], on="Slide_ID", how="left")
npif_mapped_df = npif_mapped_df.dropna(subset=["Patient_ID"])
npif_mapped_df["Patient_ID"] = npif_mapped_df["Patient_ID"].astype(str)

# Drop any pre-existing clinical columns to avoid duplication
columns_to_drop = ["HER2_Status", "ER_Status", "PR_Status", "Clinical_subtype"]
npif_mapped_df = npif_mapped_df.drop(columns=[col for col in columns_to_drop if col in npif_mapped_df.columns])

# Prepare and rename clinical subtype columns
clinical_metadata_df["Patient_ID"] = clinical_metadata_df["Patient_ID"].astype(str)
subtypes_df = clinical_metadata_df.rename(columns={
    "HER2_status": "HER2_Status",
    "ER_status": "ER_Status",
    "PR_status": "PR_Status"
})[["Patient_ID", "HER2_Status", "ER_Status", "PR_Status", "Clinical_subtype"]]

# Map binary values to 'Positive'/'Negative'
binary_map = {1.0: "Positive", 0.0: "Negative"}
subtypes_df["HER2_Status"] = subtypes_df["HER2_Status"].map(binary_map)
subtypes_df["ER_Status"] = subtypes_df["ER_Status"].map(binary_map)
subtypes_df["PR_Status"] = subtypes_df["PR_Status"].map(binary_map)

# Merge subtype info into npif_mapped_df
merged_df = pd.merge(subtypes_df, npif_mapped_df, on="Patient_ID", how="inner")

# Reorder to make Patient_ID the first column
cols = merged_df.columns.tolist()
cols.insert(0, cols.pop(cols.index("Patient_ID")))
merged_df = merged_df[cols]

# Print unique Slide_Name and Patient_ID values
print("Patient_IDs from POST_NAT_BRCA_HoverNet_NPIFs (Total: {}):".format(len(merged_df)))
print(merged_df["Patient_ID"].unique())

print("\nPatient IDs from POST_NAT clinical metadata (Total: {}):".format(len(subtypes_df)))
print(subtypes_df["Patient_ID"].unique())

# Find non-matching values
slide_ids = set(merged_df["Patient_ID"])
patient_ids = set(subtypes_df["Patient_ID"])

non_matching_slides = slide_ids - patient_ids
non_matching_patients = patient_ids - slide_ids

print("\nHoverNet NPIFs that do NOT have a matching Patient_ID:")
print(non_matching_slides)

print("\nClinical metadata entries that do NOT have a matching Slide NPIF:")
print(non_matching_patients)



Patient_IDs from POST_NAT_BRCA_HoverNet_NPIFs (Total: 93):
['P1' 'P2' 'P3' 'P4' 'P5' 'P6' 'P7' 'P8' 'P9' 'P10' 'P11' 'P12' 'P13'
 'P14' 'P15' 'P16' 'P17' 'P18' 'P19' 'P20' 'P21' 'P22' 'P23' 'P24' 'P25'
 'P26' 'P27' 'P28' 'P29' 'P30' 'P31' 'P32' 'P33' 'P34' 'P35' 'P37' 'P38'
 'P39' 'P40' 'P41' 'P42' 'P43' 'P44' 'P45' 'P46' 'P47' 'P48' 'P49' 'P50'
 'P51' 'P52' 'P53' 'P54']

Patient IDs from POST_NAT clinical metadata (Total: 54):
['P1' 'P2' 'P3' 'P4' 'P5' 'P6' 'P7' 'P8' 'P9' 'P10' 'P11' 'P12' 'P13'
 'P14' 'P15' 'P16' 'P17' 'P18' 'P19' 'P20' 'P21' 'P22' 'P23' 'P24' 'P25'
 'P26' 'P27' 'P28' 'P29' 'P30' 'P31' 'P32' 'P33' 'P34' 'P35' 'P36' 'P37'
 'P38' 'P39' 'P40' 'P41' 'P42' 'P43' 'P44' 'P45' 'P46' 'P47' 'P48' 'P49'
 'P50' 'P51' 'P52' 'P53' 'P54']

HoverNet NPIFs that do NOT have a matching Patient_ID:
set()

Clinical metadata entries that do NOT have a matching Slide NPIF:
{'P36'}


In [3]:
# Save merged (slide-level) result
output_dir = f"{dataset_name}/outputs/HoverNet/Subtypes/"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "HoverNet_Original_NPIFs_Values_POST_NAT_BRCA_Mapped_BRCA_Status_Filtered_Tiles_Top25Q.csv")
# --------------------------------------
# Patient-level aggregation of NPIFs
# --------------------------------------

# Identify NPIF feature columns to average
mean_std_cols = [col for col in merged_df.columns if col.startswith("Mean ") or col.startswith("Std ")]

# Aggregate NPIFs by Patient_ID (mean), keep first for clinical/status columns
patient_level_df = merged_df.groupby("Patient_ID").agg({
    "HER2_Status": "first",
    "ER_Status": "first",
    "PR_Status": "first",
    "Clinical_subtype": "first",
    **{col: "mean" for col in mean_std_cols}
}).reset_index()

# Save patient-level aggregated results to the same file (overwrite)
patient_level_df.to_csv(output_file, index=False)
print(f"\nPatient-level averaged data saved to: {output_file}")
patient_level_df


Patient-level averaged data saved to: POST_NAT_BRCA/outputs/HoverNet/Subtypes/HoverNet_Original_NPIFs_Values_POST_NAT_BRCA_Mapped_BRCA_Status_Filtered_Tiles_Top25Q.csv


Unnamed: 0,Patient_ID,HER2_Status,ER_Status,PR_Status,Clinical_subtype,Mean Area,Mean Major Axis,Mean Minor Axis,Mean Perimeter,Mean Eccentricity,Mean Circularity,Std Area,Std Major Axis,Std Minor Axis,Std Perimeter,Std Eccentricity,Std Circularity
0,P1,Negative,Positive,Positive,HR+,10.352435,4.545526,3.075789,12.933216,0.689033,0.74992,4.690496,1.186852,0.70936,3.085005,0.146831,0.09352
1,P10,Negative,Positive,Positive,HR+,9.244911,4.364864,2.888366,12.404201,0.700371,0.733826,4.606881,1.254504,0.704801,3.400931,0.146093,0.104118
2,P11,Negative,Negative,Negative,TNBC,10.982113,4.658496,3.188273,13.379289,0.680822,0.740327,5.70687,1.327943,0.811596,3.578504,0.148359,0.101595
3,P12,Negative,Negative,Negative,TNBC,9.004845,4.347669,2.834071,12.205108,0.710753,0.73648,4.654174,1.234522,0.726943,3.337302,0.146834,0.10376
4,P13,Negative,Negative,Negative,TNBC,14.567301,5.302156,3.669889,15.1435,0.676652,0.758064,7.340379,1.443294,0.941179,3.861904,0.147094,0.088999
5,P14,Negative,Positive,Positive,HR+,14.540743,5.317316,3.568992,15.001859,0.69498,0.752894,8.74611,1.668775,1.047991,4.391279,0.145831,0.0946
6,P15,Negative,Positive,Positive,HR+,9.454632,4.40653,2.926935,12.489124,0.69991,0.738576,4.474902,1.184091,0.721116,3.19826,0.148328,0.102046
7,P16,Negative,Positive,Positive,HR+,11.799307,4.925793,3.222177,13.898565,0.710026,0.730375,6.831412,1.492752,0.935581,4.066019,0.146962,0.107997
8,P17,Negative,Positive,Positive,HR+,11.725796,4.778177,3.26616,13.606413,0.681664,0.75179,6.218606,1.363842,0.882964,3.637754,0.152339,0.099539
9,P18,Negative,Negative,Negative,TNBC,9.531761,4.525995,2.87139,12.747953,0.724608,0.708362,4.879317,1.306762,0.762774,3.457569,0.145862,0.10503
