In [1]:
#### ---------------------------------------------------------------------------------------------
#### author: Ranjan Barman, date: Mar 6, 2025
#### Mapped HoverNet NPIFs to TCGA_BRCA subtypes status (all subtype status) filtered tiles top 25Q
#### -----------------------------------------------------------------------------------------------

import os
import pandas as pd

# Set working directory
# NOTE(review): this is an absolute, machine-specific path; every relative path
# below is resolved against it, so the cell only runs where this directory exists.
_wpath_ = "/data/Lab_ruppin/Ranjan/HnE/"
os.chdir(_wpath_)

print(f"Working directory: {_wpath_}\n")

# Define dataset and file paths
dataset_name = "TCGA_BRCA_FFPE"

# File paths (relative to the working directory set above)
hovernet_Predicted_NPIFs_TCGA_BRCA_file = f"{dataset_name}/outputs/HoverNet/HoverNet_NPIFs_TCGA_BRCA_1106_Filtered_Top25Q.csv"
tcga_subtypes_file = "PA_HIF_BRCA/outputs_clinical/TCGA_BRCA_Subtypes_clinical.tsv"

# Load the original NPIFs values data
hovernet_Predicted_NPIFs_TCGA_BRCA = pd.read_csv(hovernet_Predicted_NPIFs_TCGA_BRCA_file)

# Strip surrounding whitespace from column names FIRST, so that every
# name-based lookup below (drop, Slide_Name access) sees the cleaned headers.
hovernet_Predicted_NPIFs_TCGA_BRCA.columns = hovernet_Predicted_NPIFs_TCGA_BRCA.columns.str.strip()

# Drop the tile-count bookkeeping columns; they are not needed downstream
hovernet_Predicted_NPIFs_TCGA_BRCA = hovernet_Predicted_NPIFs_TCGA_BRCA.drop(
    columns=["Total_Tiles", "Filtered_Tiles"]
)

# Truncate Slide_Name to its first 12 characters (e.g. "TCGA-D8-A13Z").
# This overwrites Slide_Name in place; no separate Sample_ID column is created.
hovernet_Predicted_NPIFs_TCGA_BRCA["Slide_Name"] = hovernet_Predicted_NPIFs_TCGA_BRCA["Slide_Name"].str[:12]

# Several slides can collapse to the same 12-character ID after truncation;
# keep only the first occurrence of each.
hovernet_Predicted_NPIFs_TCGA_BRCA = hovernet_Predicted_NPIFs_TCGA_BRCA.drop_duplicates(
    subset=["Slide_Name"], keep="first"
)


# Load the TCGA BRCA subtype data (tab-separated)
tcga_subtypes_data = pd.read_table(tcga_subtypes_file, sep="\t")

# Keep only the relevant columns and rename the status columns to short names
tcga_subtypes_data = tcga_subtypes_data[
    ["sampleID", "HER2_Final_Status_nature2012", "PR_Status_nature2012", "ER_Status_nature2012"]
].rename(columns={
    "HER2_Final_Status_nature2012": "HER2_Status",
    "PR_Status_nature2012": "PR_Status",
    "ER_Status_nature2012": "ER_Status",
})

# Remove the trailing '-01' suffix from sampleID so it matches the truncated
# Slide_Name values. The pattern is anchored to the end of the string: a plain
# substring replace would also delete any '-01' occurring earlier in the ID.
# IDs with a different suffix are left unchanged.
tcga_subtypes_data["sampleID"] = tcga_subtypes_data["sampleID"].str.replace(r"-01$", "", regex=True)

hovernet_Predicted_NPIFs_TCGA_BRCA


Working directory: /data/Lab_ruppin/Ranjan/HnE/



Unnamed: 0,Slide_Name,Mean Area,Mean Major Axis,Mean Minor Axis,Mean Perimeter,Mean Eccentricity,Mean Circularity,Std Area,Std Major Axis,Std Minor Axis,Std Perimeter,Std Eccentricity,Std Circularity
0,TCGA-D8-A13Z,15.418755,5.614750,3.623911,15.739790,0.715026,0.726789,8.738497,1.736600,1.114784,4.556289,0.150470,0.100089
1,TCGA-AR-A0TR,11.008577,4.714941,3.153885,13.346091,0.698523,0.737001,5.874191,1.305372,0.888298,3.605306,0.149508,0.099834
2,TCGA-D8-A1JN,10.014065,4.323268,3.125443,12.472554,0.646506,0.784754,3.793265,0.929303,0.656990,2.484995,0.151348,0.086261
3,TCGA-A2-A0ES,11.380890,4.583000,3.342480,13.223526,0.639347,0.789252,4.738762,1.078397,0.716613,2.875026,0.148913,0.083765
4,TCGA-C8-A12U,13.390488,5.040054,3.525133,14.524594,0.669280,0.747041,7.155830,1.377498,1.057996,3.937260,0.158527,0.113454
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098,TCGA-A2-A4RX,12.453876,4.931563,3.328805,13.984596,0.684745,0.748041,7.484693,1.594685,0.982135,4.246710,0.153367,0.098541
1099,TCGA-BH-A0AY,12.299375,5.030332,3.272909,14.114666,0.709352,0.729829,7.260146,1.597837,0.985520,4.262205,0.150112,0.100775
1101,TCGA-BH-A18S,10.075491,4.397262,3.114423,12.649830,0.656577,0.771381,4.109242,1.092722,0.627684,2.843544,0.149277,0.090938
1103,TCGA-EW-A1IX,7.932001,4.047490,2.691917,11.433014,0.692919,0.748016,4.039276,1.211220,0.654713,3.295550,0.151148,0.114692


In [2]:
# Merge the subtype table with the NPIF table. The join keys are named
# differently on each side (sampleID vs. Slide_Name); the inner join keeps only
# IDs present in both tables. Slide_Name is dropped afterwards because it
# duplicates sampleID for every matched row.
merged_df = pd.merge(
    tcga_subtypes_data,
    hovernet_Predicted_NPIFs_TCGA_BRCA,
    left_on="sampleID",
    right_on="Slide_Name",
    how="inner",
).drop(columns=["Slide_Name"])

# Define output directory (relative to the current working directory)
output_path = f"{dataset_name}/outputs/HoverNet/Subtypes/"

# Ensure the directory exists; exist_ok=True already makes this a no-op when
# the directory is present, so no separate existence check is needed.
os.makedirs(output_path, exist_ok=True)

# file_name for output file
file_name = "HoverNet_Original_NPIFs_Values_TCGA_BRCA_Mapped_BRCA_Status_Filtered_Tiles_Top25Q.csv"

output_file = os.path.join(output_path, file_name)

# Save merged data (without the positional index)
merged_df.to_csv(output_file, index=False)

print(f"Mapped data saved to: {output_file}")
print("Done!")

merged_df


Mapped data saved to: TCGA_BRCA_FFPE/outputs/HoverNet/Subtypes/HoverNet_Original_NPIFs_Values_TCGA_BRCA_Mapped_BRCA_Status_Filtered_Tiles_Top25Q.csv
Done!


Unnamed: 0,sampleID,HER2_Status,PR_Status,ER_Status,Mean Area,Mean Major Axis,Mean Minor Axis,Mean Perimeter,Mean Eccentricity,Mean Circularity,Std Area,Std Major Axis,Std Minor Axis,Std Perimeter,Std Eccentricity,Std Circularity
0,TCGA-A1-A0SB,Negative,Negative,Positive,9.083610,4.279521,2.904360,12.250657,0.688904,0.743519,3.839664,1.068454,0.643313,2.922380,0.145530,0.102340
1,TCGA-A1-A0SD,Negative,Positive,Positive,10.750870,4.518914,3.215683,13.007940,0.653180,0.770368,4.950764,1.171824,0.748324,3.124821,0.153782,0.093280
2,TCGA-A1-A0SE,Negative,Positive,Positive,11.115128,4.606071,3.274291,13.224772,0.659490,0.773457,4.633991,1.085960,0.706866,2.899793,0.146859,0.086521
3,TCGA-A1-A0SF,Negative,Positive,Positive,12.963338,5.071167,3.436405,14.335088,0.690357,0.755218,7.114952,1.477987,0.931699,4.035264,0.145626,0.094786
4,TCGA-A1-A0SH,Negative,Positive,Negative,11.696231,4.794285,3.296823,13.702952,0.675181,0.751392,6.340335,1.430814,0.858550,3.874959,0.150895,0.104859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
710,TCGA-EW-A1PB,Negative,Negative,Negative,12.469393,4.769945,3.442040,13.846007,0.643849,0.767721,6.768040,1.331533,0.991438,3.772418,0.160262,0.095114
711,TCGA-EW-A1PD,Positive,Positive,Positive,12.708835,5.079825,3.372532,14.732758,0.697955,0.698507,7.681434,1.624340,1.003553,4.582990,0.150906,0.118566
712,TCGA-EW-A1PE,Negative,Positive,Positive,14.269575,5.115578,3.686002,14.706263,0.648484,0.779478,6.709199,1.298417,0.964848,3.552653,0.154878,0.094389
713,TCGA-EW-A1PF,Negative,Positive,Positive,12.896830,5.023687,3.431680,14.381161,0.679632,0.740519,6.918545,1.459972,0.965221,3.939845,0.155834,0.107518
