In [1]:
#### ------------------------------------------------------------------------------------------
#### author: Ranjan Barman, date: Mar 6, 2025
#### Map CPTAC HoverNet NPIFs to CPTAC BRCA receptor status (HER2 / PR / ER)
#### --------------------------------------------------------------------------------------------

import os
import pandas as pd

# Set the working directory so the relative dataset paths below resolve
# against the project root.
_wpath_ = "/data/Lab_ruppin/Ranjan/HnE/"
os.chdir(_wpath_)

print(f"Working directory: {_wpath_}\n")

# Dataset name; reused to build input and output paths.
dataset_name = "CPTAC_BRCA"

# Input files: per-patient HoverNet NPIFs (relative to the working directory)
# and the CPTAC BRCA clinical summary table (absolute path).
CPTAC_BRCA_HoverNet_NPIFs_file = f"{dataset_name}/HoverNet/outputs/CPTAC_BRCA_HoverNet_NPIFs_Filtered_Tiles_Top25Q.csv"
CPTAC_subtypes_file = "/data/Lab_ruppin/dhrubas2/HnE/CPTAC_BRCA/processed/CPTAC_BRCA_clinical_data_summary_matched.tsv"

# Load the original NPIF values.
CPTAC_BRCA_HoverNet_NPIFs = pd.read_csv(CPTAC_BRCA_HoverNet_NPIFs_file)

# Strip leading/trailing whitespace from the column names FIRST: dropping by
# exact label would raise KeyError if a header were padded with spaces.
CPTAC_BRCA_HoverNet_NPIFs.columns = CPTAC_BRCA_HoverNet_NPIFs.columns.str.strip()

# Drop the tile-count bookkeeping columns. A new frame is assigned instead of
# mutating in place, so re-running the cell always starts from the file.
CPTAC_BRCA_HoverNet_NPIFs = CPTAC_BRCA_HoverNet_NPIFs.drop(
    columns=["Total_Tiles", "Filtered_Tiles"]
)

# Load the CPTAC BRCA clinical table (tab-separated).
CPTAC_subtypes_data = pd.read_table(CPTAC_subtypes_file, sep="\t")

# Keep only the patient key and the three receptor-status columns, renaming
# the latter to the "*_Status" spelling used in the output file.
CPTAC_subtypes_data = CPTAC_subtypes_data[
    ["Patient_ID", "HER2_status", "PR_status", "ER_status"]
].rename(columns={
    "HER2_status": "HER2_Status",
    "PR_status": "PR_Status",
    "ER_status": "ER_Status"
})

# Cast the merge key to string and left-pad with zeros to a minimum width of
# three characters. IDs that are already three or more characters long
# (e.g. "01BR001") are left unchanged by zfill(3).
CPTAC_subtypes_data["Patient_ID"] = (
    CPTAC_subtypes_data["Patient_ID"].astype(str).str.zfill(3)
)

# Only the last expression of a cell is rendered, so a single frame is shown.
CPTAC_subtypes_data


Working directory: /data/Lab_ruppin/Ranjan/HnE/



Unnamed: 0,Patient_ID,HER2_Status,PR_Status,ER_Status
0,01BR001,Negative,Negative,Negative
1,01BR008,,,
2,01BR009,,Negative,Negative
3,01BR010,,Negative,Negative
4,01BR015,Negative,Positive,Positive
...,...,...,...,...
114,21BR001,,Negative,Negative
115,21BR002,,Positive,Positive
116,21BR010,Negative,Positive,Positive
117,22BR005,Positive,Positive,Positive


In [2]:
# Report the Patient_IDs found in each table.
# NOTE: "Total" is the number of rows in the table (len of the frame); the
# array printed underneath lists the distinct Patient_ID values.
print("Patient_IDs from CPTAC_BRCA_HoverNet_NPIFs (Total: {}):".format(len(CPTAC_BRCA_HoverNet_NPIFs)))
print(CPTAC_BRCA_HoverNet_NPIFs["Patient_ID"].unique())

print("\nPatient IDs from CPTAC_subtypes_data (Total: {}):".format(len(CPTAC_subtypes_data)))
print(CPTAC_subtypes_data["Patient_ID"].unique())

# Distinct Patient_IDs on each side of the upcoming merge.
slide_names = set(CPTAC_BRCA_HoverNet_NPIFs["Patient_ID"])
patient_ids = set(CPTAC_subtypes_data["Patient_ID"])

# IDs present in one table but missing from the other.
non_matching_slides = slide_names - patient_ids
non_matching_patients = patient_ids - slide_names

# Print the differences sorted: the iteration order of a set of strings can
# change between kernel restarts, which would make the output unreproducible.
# key=str keeps the sort from failing if a non-string value (e.g. NaN) is present.
print("\nHoverNet NPIFs that do NOT have a matching Patient_ID:")
print(sorted(non_matching_slides, key=str))

print("\n BRCA_Status that do NOT have a matching Patient_ID:")
print(sorted(non_matching_patients, key=str))

Patient_IDs from CPTAC_BRCA_HoverNet_NPIFs (Total: 198):
['01BR001' '01BR002' '01BR003' '01BR004' '01BR005' '01BR006' '01BR007'
 '01BR008' '01BR009' '01BR010' '01BR011' '01BR012' '01BR013' '01BR014'
 '01BR015' '01BR016' '01BR017' '01BR018' '01BR019' '01BR020' '01BR021'
 '01BR022' '01BR023' '01BR024' '01BR025' '01BR026' '01BR027' '01BR028'
 '01BR029' '01BR030' '01BR031' '01BR032' '01BR033' '01BR034' '01BR039'
 '01BR040' '01BR041' '01BR042' '01BR043' '03BR002' '03BR004' '03BR005'
 '03BR006' '03BR008' '03BR009' '03BR010' '03BR011' '03BR012' '03BR013'
 '03BR014' '05BR001' '05BR002' '05BR003' '05BR004' '05BR005' '05BR009'
 '05BR010' '05BR015' '05BR016' '05BR019' '05BR022' '05BR024' '05BR026'
 '05BR029' '05BR031' '05BR032' '05BR033' '05BR038' '05BR040' '05BR042'
 '05BR043' '05BR044' '05BR045' '06BR003' '06BR005' '06BR006' '06BR009'
 '06BR014' '09BR001' '09BR002' '09BR004' '09BR005' '09BR007' '11BR003'
 '11BR004' '11BR005' '11BR006' '11BR009' '11BR010' '11BR011' '11BR012'
 '11BR013' '11BR014'

In [3]:
# Inner-join the receptor-status table with the HoverNet NPIFs on Patient_ID;
# only patients present in both tables are kept.
merged_df = pd.merge(CPTAC_subtypes_data, CPTAC_BRCA_HoverNet_NPIFs, on="Patient_ID", how="inner")

# Output directory (relative path, resolved against the current working directory).
output_path = f"{dataset_name}/outputs/HoverNet/Subtypes/"

# Create the directory if needed; exist_ok=True makes a separate existence
# check unnecessary.
os.makedirs(output_path, exist_ok=True)

# Name and full path of the output file.
file_name = "HoverNet_Original_NPIFs_Values_CPTAC_BRCA_Mapped_BRCA_Status_Filtered_Tiles_Top25Q.csv"
output_file = os.path.join(output_path, file_name)

# Save the merged table without the positional index.
merged_df.to_csv(output_file, index=False)

print(f"Mapped data saved to: {output_file}")
print("Done!")

# Display the merged table as the cell result.
merged_df


Mapped data saved to: CPTAC_BRCA/outputs/HoverNet/Subtypes/HoverNet_Original_NPIFs_Values_CPTAC_BRCA_Mapped_BRCA_Status_Filtered_Tiles_Top25Q.csv
Done!


Unnamed: 0,Patient_ID,HER2_Status,PR_Status,ER_Status,Mean Area,Mean Major Axis,Mean Minor Axis,Mean Perimeter,Mean Eccentricity,Mean Circularity,Std Area,Std Major Axis,Std Minor Axis,Std Perimeter,Std Eccentricity,Std Circularity
0,01BR001,Negative,Negative,Negative,12.703960,5.169094,3.321895,14.487946,0.721761,0.719830,6.908425,1.501187,0.962863,4.068891,0.144744,0.103239
1,01BR008,,,,10.817211,4.775795,3.093159,13.542120,0.715376,0.711812,6.334456,1.491132,0.910863,4.246940,0.143972,0.115293
2,01BR009,,Negative,Negative,12.133630,4.977328,3.265108,14.034705,0.710987,0.729620,7.087879,1.493092,0.986528,4.157464,0.145807,0.099825
3,01BR010,,Negative,Negative,11.253210,4.854933,3.144331,13.596457,0.717399,0.729105,5.891783,1.374962,0.861662,3.686328,0.142571,0.101731
4,01BR015,Negative,Positive,Positive,12.315714,5.140014,3.253388,14.519424,0.733099,0.696569,7.382770,1.572019,1.004430,4.586197,0.140369,0.116484
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,21BR001,,Negative,Negative,10.820139,4.740288,3.113685,13.319329,0.708837,0.738284,5.328559,1.295681,0.801161,3.494132,0.144410,0.102292
115,21BR002,,Positive,Positive,10.808911,4.777671,3.113268,13.278989,0.713841,0.747466,5.095594,1.281168,0.764665,3.419467,0.144608,0.096143
116,21BR010,Negative,Positive,Positive,11.659692,4.972307,3.216560,14.037889,0.718977,0.715346,6.274856,1.441740,0.908870,4.062868,0.142598,0.108977
117,22BR005,Positive,Positive,Positive,11.887081,5.031690,3.224500,14.127232,0.723596,0.715593,6.209482,1.426800,0.893967,3.859797,0.143325,0.103252
