In [19]:
#### -----------------------------------------------------------------------------------------------------------------------------------------
#### author: Ranjan Barman, date: May 5, 2025
#### Maps PathAI nuHIFs (with TCGA BRCA biomarker status) to survival data, Age and Stage
#### ------------------------------------------------------------------------------------------------------------------------------------------
import os, sys, pickle, bz2
import numpy as np, pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from time import time 
from tqdm import tqdm

# Parent directory under which all datasets are saved; the relative paths used in
# this notebook resolve against it because it becomes the working directory.
# Defaults to the original cluster location; set the HNE_WORKDIR environment
# variable before starting the kernel to run the notebook on another machine.
_wpath_ = os.environ.get("HNE_WORKDIR", "/data/Lab_ruppin/Ranjan/HnE/")
os.chdir(_wpath_)  # raises FileNotFoundError when the directory does not exist

print(f"working directory = {_wpath_}\n")

working directory = /data/Lab_ruppin/Ranjan/HnE/



In [20]:
# Dataset identifier (used to build the output path) and the three biomarker
# status columns that the filtering step below inspects
dataset_name = "TCGA_BRCA_FFPE"
outcome_names = ["HER2_Status", "PR_Status", "ER_Status"]

# Input files, relative to the working directory
PathAI_HIFs_with_BRCA_status_file = "PA_PIF_BRCA/outputs_biomarker_status/tcga_brca_subtype_to_original_pifs.csv"
pathAI_TCGA_BRCA_survival_data_file = "PA_NUHIF_BRCA/data/PathAI_BRCA_MetaData.xlsx"

# Output directory (note the trailing separator: file names are appended directly to it)
out_path = f"{dataset_name}/outputs/PathAI/Survival_V2/"
os.makedirs(out_path, exist_ok=True)  # no error if the directory already exists

# Load the PathAI feature table and keep its sample IDs as a 0..n-1 indexed Series.
# (Bare `data_full` expressions in the middle of a cell display nothing, so the
# two that used to follow here were dropped.)
data_full = pd.read_csv(PathAI_HIFs_with_BRCA_status_file)
patient_list = data_full['sample_id'].reset_index(drop=True)


# Keep only the rows in which every column of `outcome_names` is exactly
# "Positive" or "Negative"; a row with any other value in one of them is dropped.
has_resolved_status = data_full[outcome_names].isin(["Positive", "Negative"]).all(axis=1)
data_filtered = data_full[has_resolved_status].copy()  # independent copy, avoids SettingWithCopyWarning

# Sample IDs of the retained rows, selected explicitly by index label
filtered_patient_list = patient_list.loc[data_filtered.index]

# Separate copy of the filtered table, so that later column drops on
# `data_filtered` cannot affect the original nuHIF values
data_orginal_HIFs = data_filtered.copy()




# Class-prediction table from the TCGA BRCA subtype analysis; its sample IDs
# define which samples are kept here (presumably 556 of them, going by the name)
TCGA_BRCA_Subtypes_556_file = f"{dataset_name}/outputs/HoverNet/Subtypes/outputs_biomarker_status_prediction_results_All_HoverNet_NPIFs/combined_class_predictions_all_features_for_subtypes.csv"
TCGA_BRCA_Subtypes_556 = pd.read_csv(TCGA_BRCA_Subtypes_556_file)

# The 'sample_id' column as a Series with a fresh 0..n-1 index
sample_list_556 = TCGA_BRCA_Subtypes_556['sample_id'].reset_index(drop=True)

# Restrict the status-filtered nuHIF table to the samples present in `sample_list_556`
PathAI_HIFs_with_BRCA_status = data_orginal_HIFs[data_orginal_HIFs['sample_id'].isin(sample_list_556)].copy()


# Load the metadata workbook and, in the same step, give the two verbosely
# named columns the short names used by the rest of the notebook
pathAI_TCGA_BRCA_survival_data = pd.read_excel(pathAI_TCGA_BRCA_survival_data_file).rename(
    columns={
        "age_at_initial_pathologic_diagnosis": "Age",
        "ajcc_pathologic_tumor_stage": "Stage",
    }
)

# Last expression of the cell: renders the cohort-restricted nuHIF table
PathAI_HIFs_with_BRCA_status



Unnamed: 0,sample_id,HER2_Status,PR_Status,ER_Status,MEAN[CANCER_NUCLEUS_AREA]_H & E,MEAN[CANCER_NUCLEUS_MAJOR_AXIS_LENGTH]_H & E,MEAN[CANCER_NUCLEUS_MINOR_AXIS_LENGTH]_H & E,MEAN[CANCER_NUCLEUS_PERIMETER]_H & E,MEAN[CANCER_NUCLEUS_CIRCULARITY]_H & E,MEAN[CANCER_NUCLEUS_ECCENTRICITY]_H & E,...,AREA PROP [[ESI_0080] OVER [TUMOR]] IN [TISSUE]_HE,AREA PROP [[STROMA] OVER [TUMOR]] IN [TISSUE]_HE,REGION PROPERTIES: AVERAGE ECCENTRICITY OF SIGNIFICANT REGIONS OF TUMOR_HE,REGION PROPERTIES: AVERAGE SOLIDITY OF SIGNIFICANT REGIONS OF TUMOR_HE,REGION PROPERTIES: ECCENTRICITY OF LARGEST REGION OF TUMOR_HE,STD[CANCER_NUCLEUS_MIN_GRAYSCALE_CHANNEL_GRAY]_H & E,STD[CANCER_NUCLEUS_MIN_HSV_CHANNEL_SATURATION]_H & E,REGION PROPERTIES: FILLED AREA (MM2) OF LARGEST REGION OF TUMOR_HE,REGION PROPERTIES: LACUNARITY OF LARGEST REGION OF TUMOR_HE,REGION PROPERTIES: LACUNARITY OF TUMOR_HE
0,TCGA-A1-A0SB,Negative,Negative,Positive,36.561916,8.579009,5.781790,23.836950,0.789023,0.697108,...,0.465021,0.678288,0.652634,0.421135,0.825884,0.057112,0.052104,6.726608,1.521139,2.439853
1,TCGA-A1-A0SD,Negative,Positive,Positive,42.765987,9.012733,6.361746,25.581022,0.795899,0.660698,...,0.609570,0.694431,0.767243,0.440930,0.771394,0.064148,0.058635,10.915200,1.377035,1.752753
2,TCGA-A1-A0SE,Negative,Positive,Positive,45.249550,9.253537,6.567734,26.291719,0.801859,0.661877,...,0.739094,0.618575,0.740349,0.392250,0.818779,0.067092,0.047699,11.227888,1.383636,2.312483
3,TCGA-A1-A0SF,Negative,Positive,Positive,47.735512,9.827610,6.466487,27.548254,0.760771,0.709445,...,0.414285,0.951365,0.768185,0.417708,0.625082,0.059546,0.074203,6.866320,1.423826,1.282133
4,TCGA-A1-A0SH,Negative,Positive,Negative,46.371204,9.408572,6.615330,26.852484,0.788655,0.663869,...,0.734813,0.676047,0.857859,0.477949,0.903828,0.041100,0.079935,1.243360,1.423946,1.735129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
572,TCGA-EW-A1PB,Negative,Negative,Negative,45.677532,9.100985,6.613077,26.061266,0.804127,0.643324,...,0.637076,0.283918,0.520623,0.715467,0.520623,0.044723,0.060789,148.351440,0.958411,1.340844
573,TCGA-EW-A1PD,Positive,Positive,Positive,45.869053,9.275162,6.557035,26.689734,0.777331,0.662018,...,0.615782,0.656929,0.861209,0.498304,0.926991,0.050780,0.084191,14.192448,1.230334,2.343512
574,TCGA-EW-A1PE,Negative,Positive,Positive,58.831284,10.317983,7.521458,29.459068,0.822693,0.643314,...,0.683994,0.455836,0.688806,0.739581,0.688806,0.057292,0.050407,190.986032,0.805378,1.067564
575,TCGA-EW-A1PF,Negative,Positive,Positive,52.917530,9.815355,7.153934,28.218414,0.807872,0.642525,...,0.785701,0.688055,0.613136,0.623601,0.613136,0.051204,0.067629,63.300064,1.226349,2.544999


In [21]:
# Columns carried over from the metadata table, in the order they should appear:
# the os/pfs fields with their times, Age, Stage, and the patient barcode
desired_columns = ['os', 'os_time', 'pfs', 'pfs_time', 'Age', 'Stage', 'bcr_patient_barcode']

# Reduce the metadata to exactly those columns (a missing column raises KeyError)
pathAI_TCGA_BRCA_survival_data_with_desired_columns = pathAI_TCGA_BRCA_survival_data.loc[:, desired_columns]
pathAI_TCGA_BRCA_survival_data_with_desired_columns


Unnamed: 0,os,os_time,pfs,pfs_time,Age,Stage,bcr_patient_barcode
0,0,8,0,8,30,Stage IIB,TCGA-PL-A8LY
1,1,2009,1,2009,79,Stage I,TCGA-BH-A18S
2,0,446,0,446,62,Stage IIB,TCGA-A7-A4SC
3,0,433,0,433,71,Stage IIA,TCGA-D8-A1Y2
4,0,385,0,385,55,Stage IIA,TCGA-C8-A12V
...,...,...,...,...,...,...,...
881,0,1150,0,1150,54,Stage IIB,TCGA-AR-A1AS
882,0,317,0,317,38,Stage IIIA,TCGA-C8-A1HL
883,0,849,1,681,73,Stage IIIA,TCGA-LQ-A4E4
884,0,172,0,172,32,Stage IA,TCGA-AN-A0FF


In [22]:
# Attach the survival/clinical columns to each nuHIF row. The patient identifier
# is 'sample_id' in the PathAI table and 'bcr_patient_barcode' in the survival
# table; the barcode column is dropped afterwards because it repeats 'sample_id'.
merged_df1 = pd.merge(
    PathAI_HIFs_with_BRCA_status,
    pathAI_TCGA_BRCA_survival_data_with_desired_columns,
    left_on='sample_id',
    right_on='bcr_patient_barcode',
    how='inner',
).drop(columns='bcr_patient_barcode')

# An inner join silently discards samples that have no survival record and
# repeats samples whose barcode occurs more than once in the survival table.
# Report both cases so that a change in row count never goes unnoticed.
has_survival_record = PathAI_HIFs_with_BRCA_status['sample_id'].isin(
    pathAI_TCGA_BRCA_survival_data_with_desired_columns['bcr_patient_barcode']
)
samples_without_survival = PathAI_HIFs_with_BRCA_status.loc[~has_survival_record, 'sample_id'].tolist()
if samples_without_survival:
    print(f"WARNING: {len(samples_without_survival)} sample(s) dropped by the merge "
          f"(no survival record): {samples_without_survival}")
if len(merged_df1) > int(has_survival_record.sum()):
    print("WARNING: the merge produced more rows than matched samples; "
          "'bcr_patient_barcode' is duplicated in the survival table")

merged_df1



Unnamed: 0,sample_id,HER2_Status,PR_Status,ER_Status,MEAN[CANCER_NUCLEUS_AREA]_H & E,MEAN[CANCER_NUCLEUS_MAJOR_AXIS_LENGTH]_H & E,MEAN[CANCER_NUCLEUS_MINOR_AXIS_LENGTH]_H & E,MEAN[CANCER_NUCLEUS_PERIMETER]_H & E,MEAN[CANCER_NUCLEUS_CIRCULARITY]_H & E,MEAN[CANCER_NUCLEUS_ECCENTRICITY]_H & E,...,STD[CANCER_NUCLEUS_MIN_HSV_CHANNEL_SATURATION]_H & E,REGION PROPERTIES: FILLED AREA (MM2) OF LARGEST REGION OF TUMOR_HE,REGION PROPERTIES: LACUNARITY OF LARGEST REGION OF TUMOR_HE,REGION PROPERTIES: LACUNARITY OF TUMOR_HE,os,os_time,pfs,pfs_time,Age,Stage
0,TCGA-A1-A0SB,Negative,Negative,Positive,36.561916,8.579009,5.781790,23.836950,0.789023,0.697108,...,0.052104,6.726608,1.521139,2.439853,0,259,0,259,70,Stage I
1,TCGA-A1-A0SD,Negative,Positive,Positive,42.765987,9.012733,6.361746,25.581022,0.795899,0.660698,...,0.058635,10.915200,1.377035,1.752753,0,437,0,437,59,Stage IIA
2,TCGA-A1-A0SE,Negative,Positive,Positive,45.249550,9.253537,6.567734,26.291719,0.801859,0.661877,...,0.047699,11.227888,1.383636,2.312483,0,1321,0,1321,56,Stage I
3,TCGA-A1-A0SF,Negative,Positive,Positive,47.735512,9.827610,6.466487,27.548254,0.760771,0.709445,...,0.074203,6.866320,1.423826,1.282133,0,1463,0,1463,54,Stage IIA
4,TCGA-A1-A0SH,Negative,Positive,Negative,46.371204,9.408572,6.615330,26.852484,0.788655,0.663869,...,0.079935,1.243360,1.423946,1.735129,0,1437,0,1437,39,Stage IIA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551,TCGA-EW-A1PB,Negative,Negative,Negative,45.677532,9.100985,6.613077,26.061266,0.804127,0.643324,...,0.060789,148.351440,0.958411,1.340844,0,608,0,608,70,Stage IIIA
552,TCGA-EW-A1PD,Positive,Positive,Positive,45.869053,9.275162,6.557035,26.689734,0.777331,0.662018,...,0.084191,14.192448,1.230334,2.343512,0,424,0,424,61,Stage IIA
553,TCGA-EW-A1PE,Negative,Positive,Positive,58.831284,10.317983,7.521458,29.459068,0.822693,0.643314,...,0.050407,190.986032,0.805378,1.067564,0,320,0,320,56,Stage IIA
554,TCGA-EW-A1PF,Negative,Positive,Positive,52.917530,9.815355,7.153934,28.218414,0.807872,0.642525,...,0.067629,63.300064,1.226349,2.544999,0,439,0,439,50,Stage IA


In [23]:
# Destination of the merged table; the file name is appended directly to `out_path`
BrcaBiomarkerStatusToPathAI_PIFsToSurvival = f"{out_path}BrcaBiomarkerStatusToPathAI_PIFsToSurvival.csv"

# Persist the merged nuHIF + survival table without the row index
merged_df1.to_csv(path_or_buf=BrcaBiomarkerStatusToPathAI_PIFsToSurvival, index=False)

# Tell the user where the result was written
print("The files have been mapped and saved to:", BrcaBiomarkerStatusToPathAI_PIFsToSurvival)
print("Done!")

The files have been mapped and saved to: TCGA_BRCA_FFPE/outputs/PathAI/Survival_V2/BrcaBiomarkerStatusToPathAI_PIFsToSurvival.csv
Done!
