In [4]:
#### ------------------------------------------------------------------------------------------
#### author: Ranjan Barman, date: Feb 18, 2025
#### Mapped HoverNet NPIFs to TCGA_BRCA subtypes status (all subtype status) 
#### --------------------------------------------------------------------------------------------

import os
import pandas as pd

# Run everything relative to the project root so the relative paths below resolve
_wpath_ = "/data/Lab_ruppin/Ranjan/HnE/"
os.chdir(_wpath_)
print(f"Working directory: {_wpath_}\n")

# Dataset name and input files (paths are relative to the working directory)
dataset_name = "TCGA_BRCA_FFPE"
hovernet_Predicted_NPIFs_TCGA_BRCA_file = f"{dataset_name}/outputs/HoverNet/HoverNet_NPIFs_TCGA_BRCA_1106.csv"
tcga_subtypes_file = "PA_HIF_BRCA/outputs_clinical/TCGA_BRCA_Subtypes_clinical.tsv"

# Read the HoverNet NPIF table and normalise it in one pipeline:
#   1. strip leading/trailing whitespace from the column headers
#   2. cut Slide_Name down to its first 12 characters (the column is overwritten,
#      no separate ID column is created)
#   3. keep only the first row for each truncated Slide_Name
hovernet_Predicted_NPIFs_TCGA_BRCA = (
    pd.read_csv(hovernet_Predicted_NPIFs_TCGA_BRCA_file)
    .rename(columns=str.strip)
    .assign(Slide_Name=lambda npifs: npifs["Slide_Name"].str[:12])
    .drop_duplicates(subset=["Slide_Name"], keep="first")
)


# Load the TCGA BRCA subtype table (tab-separated file)
tcga_subtypes_data = pd.read_table(tcga_subtypes_file, sep="\t")

# Keep the sample identifier plus the HER2 / PR / ER status columns,
# and give the status columns shorter names
tcga_subtypes_data = tcga_subtypes_data[
    ["sampleID", "HER2_Final_Status_nature2012", "PR_Status_nature2012", "ER_Status_nature2012"]
].rename(columns={
    "HER2_Final_Status_nature2012": "HER2_Status",
    "PR_Status_nature2012": "PR_Status",
    "ER_Status_nature2012": "ER_Status",
})

# Remove the '-01' suffix from sampleID so it lines up with the truncated Slide_Name.
# The pattern is anchored with '$' on purpose: a plain substring replace would also
# delete any '-01' occurring inside the identifier and silently corrupt that ID.
tcga_subtypes_data["sampleID"] = tcga_subtypes_data["sampleID"].str.replace(r"-01$", "", regex=True)

# Preview the de-duplicated NPIF table (last expression of the cell is displayed)
hovernet_Predicted_NPIFs_TCGA_BRCA


Working directory: /data/Lab_ruppin/Ranjan/HnE/



Unnamed: 0,Slide_Name,Mean Area,Mean Major Axis,Mean Minor Axis,Mean Perimeter,Mean Eccentricity,Mean Circularity,Std Area,Std Major Axis,Std Minor Axis,Std Perimeter,Std Eccentricity,Std Circularity
0,TCGA-D8-A13Z,14.833710,5.506910,3.551186,15.416370,0.715561,0.726959,8.730753,1.757182,1.117617,4.616545,0.149701,0.099877
1,TCGA-AR-A0TR,10.636613,4.671437,3.086942,13.165565,0.705367,0.734036,5.749400,1.315593,0.872088,3.614005,0.148422,0.100828
2,TCGA-D8-A1JN,9.969764,4.350092,3.102966,12.481813,0.656248,0.781682,3.748079,0.941256,0.639748,2.473921,0.150293,0.086766
3,TCGA-A2-A0ES,11.153084,4.567200,3.296139,13.127766,0.646566,0.784621,4.801563,1.108456,0.731919,2.965663,0.149658,0.086852
4,TCGA-C8-A12U,12.727178,4.966165,3.406089,14.235603,0.682669,0.740320,7.040320,1.393509,1.042967,3.973247,0.156266,0.114998
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098,TCGA-A2-A4RX,12.221909,4.906013,3.294964,13.869386,0.688448,0.747490,7.354571,1.583873,0.972600,4.206819,0.152525,0.098693
1099,TCGA-BH-A0AY,12.060166,4.994413,3.240219,13.998588,0.711572,0.729147,7.132281,1.590212,0.966955,4.229923,0.148516,0.101088
1101,TCGA-BH-A18S,10.569445,4.515766,3.179218,12.963364,0.660976,0.768189,4.505075,1.148057,0.666009,3.010046,0.149184,0.092265
1103,TCGA-EW-A1IX,8.448508,4.211481,2.764026,11.900906,0.699656,0.734733,4.677035,1.352918,0.722392,3.724168,0.150583,0.124417


In [5]:
# Inner-join the subtype table with the NPIF table (sampleID <-> Slide_Name), so only
# samples present in both are kept, then drop the now-redundant Slide_Name key.
merged_df = pd.merge(
    tcga_subtypes_data,
    hovernet_Predicted_NPIFs_TCGA_BRCA,
    left_on="sampleID",
    right_on="Slide_Name",
    how="inner",
).drop(columns=["Slide_Name"])

# Output directory; exist_ok=True already makes this a no-op when the directory exists,
# so no separate existence check is needed.
output_path = f"{dataset_name}/outputs/HoverNet/Subtypes/"
os.makedirs(output_path, exist_ok=True)

# Full path of the output CSV
file_name = "HoverNet_Original_NPIFs_Values_TCGA_BRCA_Mapped_BRCA_Status.csv"
output_file = os.path.join(output_path, file_name)

# Save merged data (the DataFrame index is not written)
merged_df.to_csv(output_file, index=False)

print(f"Mapped data saved to: {output_file}")
print("Done!")

# Preview the merged table (last expression of the cell is displayed)
merged_df


Mapped data saved to: TCGA_BRCA_FFPE/outputs/HoverNet/Subtypes/HoverNet_Original_NPIFs_Values_TCGA_BRCA_Mapped_BRCA_Status.csv
Done!


Unnamed: 0,sampleID,HER2_Status,PR_Status,ER_Status,Mean Area,Mean Major Axis,Mean Minor Axis,Mean Perimeter,Mean Eccentricity,Mean Circularity,Std Area,Std Major Axis,Std Minor Axis,Std Perimeter,Std Eccentricity,Std Circularity
0,TCGA-A1-A0SB,Negative,Negative,Positive,9.195387,4.321033,2.915560,12.345927,0.691851,0.740468,4.043874,1.111174,0.666327,3.044964,0.146049,0.103458
1,TCGA-A1-A0SD,Negative,Positive,Positive,10.990767,4.598732,3.237783,13.198198,0.660687,0.764008,5.337735,1.237818,0.782514,3.338023,0.153062,0.097086
2,TCGA-A1-A0SE,Negative,Positive,Positive,11.248593,4.661373,3.277948,13.351101,0.666076,0.766299,5.136734,1.180780,0.762165,3.218593,0.147333,0.092280
3,TCGA-A1-A0SF,Negative,Positive,Positive,13.471537,5.184088,3.504886,14.653101,0.691533,0.749764,7.607367,1.549759,0.976177,4.244868,0.145507,0.096608
4,TCGA-A1-A0SH,Negative,Positive,Negative,12.127593,4.886405,3.354515,13.920132,0.676842,0.753368,6.608414,1.466462,0.876198,3.939578,0.149730,0.102938
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
710,TCGA-EW-A1PB,Negative,Negative,Negative,12.412278,4.800650,3.416715,13.862514,0.654321,0.764706,6.683823,1.328402,0.977391,3.737845,0.159393,0.095545
711,TCGA-EW-A1PD,Positive,Positive,Positive,12.608980,5.084656,3.349750,14.658306,0.702925,0.699644,7.695761,1.644407,1.011241,4.622193,0.149923,0.117395
712,TCGA-EW-A1PE,Negative,Positive,Positive,13.904235,5.093487,3.613989,14.562767,0.658886,0.773454,6.785858,1.343361,0.977925,3.650983,0.154884,0.097561
713,TCGA-EW-A1PF,Negative,Positive,Positive,12.666117,4.996555,3.389470,14.254749,0.683962,0.739705,6.951075,1.480680,0.971618,3.997046,0.155604,0.107671
