In [1]:
#### -----------------------------------------------------------------------------------------------------------------------------------------
#### author: Ranjan Barman, date: Mar 11, 2025
#### Map HoverNet NPIFs (with TCGA-BRCA biomarker status) to survival outcomes (OS/PFS), Age and Stage
#### ------------------------------------------------------------------------------------------------------------------------------------------
import os, sys, pickle, bz2
import numpy as np, pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from time import time 
from tqdm import tqdm

# Parent directory under which all datasets are saved; every path used below is relative to it.
# The location may be overridden with the HNE_WORKDIR environment variable so the notebook can
# run on a machine where the default path does not exist. An unset or empty variable falls back
# to the original default.
_wpath_ = os.environ.get("HNE_WORKDIR") or "/data/Lab_ruppin/Ranjan/HnE/"
os.chdir(_wpath_)        # raises FileNotFoundError if the directory is missing

print(f"working directory = {_wpath_}\n")

working directory = /data/Lab_ruppin/Ranjan/HnE/



In [2]:
# ---- Configuration: dataset, outcome columns and file locations (relative to the working directory) ----
dataset_name = "TCGA_BRCA_FFPE"
outcome_names = ["HER2_Status", "PR_Status", "ER_Status"]

# HoverNet NPIF values together with the BRCA biomarker status columns
hovernet_NPIFs_with_BRCA_status_file = f"{dataset_name}/outputs/HoverNet/Subtypes/HoverNet_Original_NPIFs_Values_TCGA_BRCA_Mapped_BRCA_Status_Filtered_Tiles_Top25Q.csv"

# PathAI metadata workbook that carries the survival columns
pathAI_TCGA_BRCA_survival_data_file = "PA_NUHIF_BRCA/data/PathAI_BRCA_MetaData.xlsx"

# Destination folder for this notebook's outputs; created if it does not exist already
out_path = f"{dataset_name}/outputs/HoverNet/Survival_V2/"
os.makedirs(out_path, exist_ok=True)

# ---- Load both tables ----
pathAI_TCGA_BRCA_survival_data = pd.read_excel(pathAI_TCGA_BRCA_survival_data_file)
hovernet_NPIFs_with_BRCA_status = pd.read_csv(hovernet_NPIFs_with_BRCA_status_file)

# Strip leading/trailing whitespace from the NPIF column names
hovernet_NPIFs_with_BRCA_status.columns = hovernet_NPIFs_with_BRCA_status.columns.str.strip()

# Rename the sample identifier column to 'sample_id'
hovernet_NPIFs_with_BRCA_status = hovernet_NPIFs_with_BRCA_status.rename(columns={"sampleID": "sample_id"})

# Shorten the age and stage column names in the survival table
pathAI_TCGA_BRCA_survival_data = pathAI_TCGA_BRCA_survival_data.rename(
    columns={"age_at_initial_pathologic_diagnosis": "Age", "ajcc_pathologic_tumor_stage": "Stage"}
)

# Only the last expression of a cell is rendered, so a bare `hovernet_NPIFs_with_BRCA_status`
# placed before this line was silently discarded and has been removed. To inspect that frame,
# evaluate it in its own cell.
pathAI_TCGA_BRCA_survival_data



Unnamed: 0,H & E_ID,gender,vital_status,aneuploidy_score,hrd_score,genome_doublings,os,os_time,pfs,pfs_time,Age,Stage,bcr_patient_barcode,mpp,subtype
0,26524,FEMALE,Alive,0.0,0.0,0.0,0,8,0,8,30,Stage IIB,TCGA-PL-A8LY,0.2527,
1,75662,FEMALE,Dead,3.0,10.0,0.0,1,2009,1,2009,79,Stage I,TCGA-BH-A18S,0.2485,BRCA_LumA
2,80086,FEMALE,Alive,,,,0,446,0,446,62,Stage IIB,TCGA-A7-A4SC,0.2480,
3,80092,FEMALE,Alive,10.0,12.0,0.0,0,433,0,433,71,Stage IIA,TCGA-D8-A1Y2,0.2468,BRCA_LumB
4,80099,FEMALE,Alive,,,,0,385,0,385,55,Stage IIA,TCGA-C8-A12V,0.2485,BRCA_Basal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
881,91896,FEMALE,Alive,6.0,9.0,0.0,0,1150,0,1150,54,Stage IIB,TCGA-AR-A1AS,0.2468,BRCA_LumA
882,91897,FEMALE,Alive,11.0,17.0,1.0,0,317,0,317,38,Stage IIIA,TCGA-C8-A1HL,0.2456,BRCA_LumB
883,91901,FEMALE,Alive,,,,0,849,1,681,73,Stage IIIA,TCGA-LQ-A4E4,0.2456,
884,91908,FEMALE,Alive,3.0,11.0,0.0,0,172,0,172,32,Stage IA,TCGA-AN-A0FF,0.2480,BRCA_LumB


In [3]:
# Columns to carry forward from the survival table: the OS/PFS event and time columns,
# Age, Stage, and the patient barcode that serves as the join key in the next step.
desired_columns = ['os', 'os_time', 'pfs', 'pfs_time', 'Age', 'Stage', 'bcr_patient_barcode']

# Keep only those columns, in the order listed above (all other metadata columns are dropped)
pathAI_TCGA_BRCA_survival_data_with_desired_columns = pathAI_TCGA_BRCA_survival_data.loc[:, desired_columns]
pathAI_TCGA_BRCA_survival_data_with_desired_columns


Unnamed: 0,os,os_time,pfs,pfs_time,Age,Stage,bcr_patient_barcode
0,0,8,0,8,30,Stage IIB,TCGA-PL-A8LY
1,1,2009,1,2009,79,Stage I,TCGA-BH-A18S
2,0,446,0,446,62,Stage IIB,TCGA-A7-A4SC
3,0,433,0,433,71,Stage IIA,TCGA-D8-A1Y2
4,0,385,0,385,55,Stage IIA,TCGA-C8-A12V
...,...,...,...,...,...,...,...
881,0,1150,0,1150,54,Stage IIB,TCGA-AR-A1AS
882,0,317,0,317,38,Stage IIIA,TCGA-C8-A1HL
883,0,849,1,681,73,Stage IIIA,TCGA-LQ-A4E4
884,0,172,0,172,32,Stage IA,TCGA-AN-A0FF


In [4]:
# Attach the survival columns to the NPIF / biomarker-status table.
# The identifier is named 'sample_id' in the NPIF table and 'bcr_patient_barcode' in the
# survival table; the inner join keeps only identifiers present in both.
# NOTE(review): if a barcode occurs more than once in the survival table, the join repeats the
# matching NPIF row once per occurrence — confirm the barcodes are unique.
merged_df1 = (
    hovernet_NPIFs_with_BRCA_status
    .merge(
        pathAI_TCGA_BRCA_survival_data_with_desired_columns,
        how='inner',
        left_on='sample_id',
        right_on='bcr_patient_barcode',
    )
    # after the join this column only duplicates 'sample_id'
    .drop(columns='bcr_patient_barcode')
)

merged_df1

Unnamed: 0,sample_id,HER2_Status,PR_Status,ER_Status,Mean Area,Mean Major Axis,Mean Minor Axis,Mean Perimeter,Mean Eccentricity,Mean Circularity,...,Std Minor Axis,Std Perimeter,Std Eccentricity,Std Circularity,os,os_time,pfs,pfs_time,Age,Stage
0,TCGA-A1-A0SB,Negative,Negative,Positive,9.083610,4.279521,2.904360,12.250657,0.688904,0.743519,...,0.643313,2.922380,0.145530,0.102340,0,259,0,259,70,Stage I
1,TCGA-A1-A0SD,Negative,Positive,Positive,10.750870,4.518914,3.215683,13.007940,0.653180,0.770368,...,0.748324,3.124821,0.153782,0.093280,0,437,0,437,59,Stage IIA
2,TCGA-A1-A0SE,Negative,Positive,Positive,11.115128,4.606071,3.274291,13.224772,0.659490,0.773457,...,0.706866,2.899793,0.146859,0.086521,0,1321,0,1321,56,Stage I
3,TCGA-A1-A0SF,Negative,Positive,Positive,12.963338,5.071167,3.436405,14.335088,0.690357,0.755218,...,0.931699,4.035264,0.145626,0.094786,0,1463,0,1463,54,Stage IIA
4,TCGA-A1-A0SH,Negative,Positive,Negative,11.696231,4.794285,3.296823,13.702952,0.675181,0.751392,...,0.858550,3.874959,0.150895,0.104859,0,1437,0,1437,39,Stage IIA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
567,TCGA-EW-A1PB,Negative,Negative,Negative,12.469393,4.769945,3.442040,13.846007,0.643849,0.767721,...,0.991438,3.772418,0.160262,0.095114,0,608,0,608,70,Stage IIIA
568,TCGA-EW-A1PD,Positive,Positive,Positive,12.708835,5.079825,3.372532,14.732758,0.697955,0.698507,...,1.003553,4.582990,0.150906,0.118566,0,424,0,424,61,Stage IIA
569,TCGA-EW-A1PE,Negative,Positive,Positive,14.269575,5.115578,3.686002,14.706263,0.648484,0.779478,...,0.964848,3.552653,0.154878,0.094389,0,320,0,320,56,Stage IIA
570,TCGA-EW-A1PF,Negative,Positive,Positive,12.896830,5.023687,3.431680,14.381161,0.679632,0.740519,...,0.965221,3.939845,0.155834,0.107518,0,439,0,439,50,Stage IA


In [5]:
# Class-prediction results from the TCGA BRCA biomarker-status prediction run; the samples
# listed in this file define the cohort kept below.
TCGA_BRCA_Subtypes_556_file = f"{dataset_name}/outputs/HoverNet/Subtypes/outputs_biomarker_status_prediction_results_All_HoverNet_NPIFs/combined_class_predictions_all_features_for_subtypes.csv"

TCGA_BRCA_Subtypes_556 = pd.read_csv(TCGA_BRCA_Subtypes_556_file)

# Sample identifiers from the 'sample_id' column, as a Series with a fresh 0..n-1 index
sample_list_556 = TCGA_BRCA_Subtypes_556['sample_id'].reset_index(drop=True)

# Keep only the merged rows whose sample_id appears in `sample_list_556`;
# .copy() makes the result independent of `merged_df1`
in_prediction_cohort = merged_df1['sample_id'].isin(sample_list_556)
merged_df1_filtered = merged_df1.loc[in_prediction_cohort].copy()
merged_df1_filtered


Unnamed: 0,sample_id,HER2_Status,PR_Status,ER_Status,Mean Area,Mean Major Axis,Mean Minor Axis,Mean Perimeter,Mean Eccentricity,Mean Circularity,...,Std Minor Axis,Std Perimeter,Std Eccentricity,Std Circularity,os,os_time,pfs,pfs_time,Age,Stage
0,TCGA-A1-A0SB,Negative,Negative,Positive,9.083610,4.279521,2.904360,12.250657,0.688904,0.743519,...,0.643313,2.922380,0.145530,0.102340,0,259,0,259,70,Stage I
1,TCGA-A1-A0SD,Negative,Positive,Positive,10.750870,4.518914,3.215683,13.007940,0.653180,0.770368,...,0.748324,3.124821,0.153782,0.093280,0,437,0,437,59,Stage IIA
2,TCGA-A1-A0SE,Negative,Positive,Positive,11.115128,4.606071,3.274291,13.224772,0.659490,0.773457,...,0.706866,2.899793,0.146859,0.086521,0,1321,0,1321,56,Stage I
3,TCGA-A1-A0SF,Negative,Positive,Positive,12.963338,5.071167,3.436405,14.335088,0.690357,0.755218,...,0.931699,4.035264,0.145626,0.094786,0,1463,0,1463,54,Stage IIA
4,TCGA-A1-A0SH,Negative,Positive,Negative,11.696231,4.794285,3.296823,13.702952,0.675181,0.751392,...,0.858550,3.874959,0.150895,0.104859,0,1437,0,1437,39,Stage IIA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
567,TCGA-EW-A1PB,Negative,Negative,Negative,12.469393,4.769945,3.442040,13.846007,0.643849,0.767721,...,0.991438,3.772418,0.160262,0.095114,0,608,0,608,70,Stage IIIA
568,TCGA-EW-A1PD,Positive,Positive,Positive,12.708835,5.079825,3.372532,14.732758,0.697955,0.698507,...,1.003553,4.582990,0.150906,0.118566,0,424,0,424,61,Stage IIA
569,TCGA-EW-A1PE,Negative,Positive,Positive,14.269575,5.115578,3.686002,14.706263,0.648484,0.779478,...,0.964848,3.552653,0.154878,0.094389,0,320,0,320,56,Stage IIA
570,TCGA-EW-A1PF,Negative,Positive,Positive,12.896830,5.023687,3.431680,14.381161,0.679632,0.740519,...,0.965221,3.939845,0.155834,0.107518,0,439,0,439,50,Stage IA


In [6]:
# Destination of the mapped table (biomarker status + HoverNet NPIFs + survival columns)
BrcaBiomarkerStatusToHoverNet_NPIFsToSurvival = out_path + "BrcaBiomarkerStatusToHoverNet_NPIFsToSurvival.csv"

# Save the filtered, merged table; the DataFrame index is not written
merged_df1_filtered.to_csv(path_or_buf=BrcaBiomarkerStatusToHoverNet_NPIFsToSurvival, index=False)

print("The files have been mapped and saved to:", BrcaBiomarkerStatusToHoverNet_NPIFsToSurvival)
print("Done!")

The files have been mapped and saved to: TCGA_BRCA_FFPE/outputs/HoverNet/Survival_V2/BrcaBiomarkerStatusToHoverNet_NPIFsToSurvival.csv
Done!
