In [19]:
#### -----------------------------------------------------------------------------------------------------------------------------------------
#### author: Ranjan Barman, date: May 5, 2025
#### Mapped PathAI nuHIFs to TCGA biomarker status with Age
#### ------------------------------------------------------------------------------------------------------------------------------------------
import os, sys, pickle, bz2
import numpy as np, pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from time import time 
from tqdm import tqdm

# Parent directory under which all datasets are saved. After the chdir below,
# every relative path used in the notebook is resolved against this directory.
_wpath_ = "/data/Lab_ruppin/Ranjan/HnE/"
os.chdir(_wpath_)

print("working directory = {}\n".format(_wpath_))

working directory = /data/Lab_ruppin/Ranjan/HnE/



In [20]:
# ---------------------------------------------------------------------------
# Configuration: dataset name, biomarker columns, input files, output folder
# ---------------------------------------------------------------------------
dataset_name = "TCGA_BRCA_FFPE"
outcome_names = ["HER2_Status", "PR_Status", "ER_Status"]

# Input tables (paths are relative to the working directory)
PathAI_HIFs_with_BRCA_status_file = "PA_NUHIF_BRCA/outputs_biomarker_status/tcga_brca_subtype_to_original_nuhifs.csv"
pathAI_TCGA_BRCA_survival_data_file = "PA_NUHIF_BRCA/data/PathAI_BRCA_MetaData.xlsx"

# Output folder; created if missing, left untouched if it already exists
out_path = f"{dataset_name}/outputs/PathAI/Survival_V2/"
os.makedirs(out_path, exist_ok=True)

# ---------------------------------------------------------------------------
# nuHIF table with biomarker status
# ---------------------------------------------------------------------------
data_full = pd.read_csv(PathAI_HIFs_with_BRCA_status_file)
patient_list = data_full['sample_id'].reset_index(drop=True)

# Keep a row only when every biomarker column is exactly "Positive" or "Negative"
has_valid_status = data_full[outcome_names].isin(["Positive", "Negative"]).all(axis=1)
data_filtered = data_full[has_valid_status].copy()   # independent copy, avoids SettingWithCopyWarning
filtered_patient_list = patient_list[data_filtered.index]

# Separate copy of the filtered table, kept under its own name
data_orginal_HIFs = data_filtered.copy()

# ---------------------------------------------------------------------------
# Restrict to the samples listed in the HoverNet subtype-prediction results
# (the "556" in the names presumably refers to the sample count - not checked here)
# ---------------------------------------------------------------------------
TCGA_BRCA_Subtypes_556_file = f"{dataset_name}/outputs/HoverNet/Subtypes/outputs_biomarker_status_prediction_results_All_HoverNet_NPIFs/combined_class_predictions_all_features_for_subtypes.csv"
TCGA_BRCA_Subtypes_556 = pd.read_csv(TCGA_BRCA_Subtypes_556_file)

# The 'sample_id' column as a Series with a fresh 0..n-1 index
sample_list_556 = TCGA_BRCA_Subtypes_556['sample_id'].reset_index(drop=True)

in_sample_list = data_orginal_HIFs['sample_id'].isin(sample_list_556)
PathAI_HIFs_with_BRCA_status = data_orginal_HIFs[in_sample_list].copy()

# ---------------------------------------------------------------------------
# Survival / clinical metadata, with two columns given shorter names
# ---------------------------------------------------------------------------
pathAI_TCGA_BRCA_survival_data = pd.read_excel(pathAI_TCGA_BRCA_survival_data_file).rename(
    columns={
        "age_at_initial_pathologic_diagnosis": "Age",
        "ajcc_pathologic_tumor_stage": "Stage",
    }
)

# Displayed as the cell output
PathAI_HIFs_with_BRCA_status



Unnamed: 0,sample_id,HER2_Status,PR_Status,ER_Status,BRCA_Subtypes,MEAN[CANCER_NUCLEUS_AREA]_H & E,MEAN[CANCER_NUCLEUS_CIRCULARITY]_H & E,MEAN[CANCER_NUCLEUS_ECCENTRICITY]_H & E,MEAN[CANCER_NUCLEUS_MAJOR_AXIS_LENGTH]_H & E,MEAN[CANCER_NUCLEUS_MEAN_LAB_CHANNEL_A]_H & E,...,STD[LYMPHOCYTE_NUCLEUS_MEAN_LAB_CHANNEL_B]_H & E,STD[LYMPHOCYTE_NUCLEUS_MINOR_AXIS_LENGTH]_H & E,STD[LYMPHOCYTE_NUCLEUS_MIN_GRAYSCALE_CHANNEL_GRAY]_H & E,STD[LYMPHOCYTE_NUCLEUS_MIN_HSV_CHANNEL_SATURATION]_H & E,STD[LYMPHOCYTE_NUCLEUS_PERIMETER]_H & E,STD[LYMPHOCYTE_NUCLEUS_SOLIDITY]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_GRAYSCALE_CHANNEL_GRAY]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_HSV_CHANNEL_SATURATION]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_LAB_CHANNEL_A]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_LAB_CHANNEL_B]_H & E
0,TCGA-D8-A1Y2,Negative,Positive,Positive,HR Positive,34.365021,0.788894,0.682284,8.251385,0.602840,...,0.007224,0.801812,0.051126,0.065717,3.289017,0.019602,0.020905,0.020543,0.003352,0.004804
1,TCGA-C8-A12V,Negative,Negative,Negative,TNBC,39.071213,0.784081,0.686660,8.686616,0.600544,...,0.017781,0.727386,0.057589,0.072681,3.023650,0.019152,0.021876,0.021078,0.004379,0.006619
2,TCGA-EW-A1P6,Negative,Positive,Positive,HR Positive,38.241444,0.810570,0.643006,8.426938,0.606436,...,0.008084,0.759774,0.039343,0.064644,2.844278,0.016114,0.020894,0.018527,0.004438,0.006021
4,TCGA-E2-A15S,Negative,Negative,Positive,HR Positive,37.934071,0.813011,0.654944,8.453368,0.610344,...,0.007589,0.618312,0.048938,0.055562,2.364664,0.012161,0.020599,0.023713,0.003215,0.005068
5,TCGA-D8-A1X5,Positive,Positive,Positive,HER2 Positive,49.571533,0.811955,0.643151,9.492211,0.578170,...,0.010493,0.701528,0.066782,0.043207,2.680271,0.015533,0.019769,0.019458,0.003928,0.005174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577,TCGA-A2-A04N,Negative,Positive,Positive,HR Positive,40.167149,0.786355,0.661289,8.731260,0.652192,...,0.006646,0.706071,0.044662,0.079656,2.822683,0.016338,0.021173,0.020896,0.004519,0.003546
578,TCGA-AR-A1AS,Negative,Positive,Positive,HR Positive,47.793850,0.759587,0.734216,10.062342,0.616858,...,0.008048,0.731268,0.040083,0.078726,3.091555,0.018503,0.022948,0.024541,0.003706,0.005923
579,TCGA-C8-A1HL,Positive,Negative,Positive,HER2 Positive,48.468864,0.763141,0.713281,9.954347,0.607404,...,0.013180,0.781547,0.060307,0.072402,3.213238,0.018057,0.022812,0.023739,0.004350,0.006741
580,TCGA-AN-A0FF,Negative,Positive,Positive,HR Positive,33.570347,0.766258,0.702174,8.304225,0.594162,...,0.007930,0.731670,0.033249,0.076339,3.251431,0.021967,0.022200,0.018067,0.003412,0.007163


In [21]:
# Columns to carry over from the survival metadata, in the order they should appear.
# 'bcr_patient_barcode' is listed last.
desired_columns = ['os', 'os_time', 'pfs', 'pfs_time', 'Age', 'Stage', 'bcr_patient_barcode']

# Select just those columns (all rows); a missing column raises KeyError.
pathAI_TCGA_BRCA_survival_data_with_desired_columns = pathAI_TCGA_BRCA_survival_data.loc[:, desired_columns]
pathAI_TCGA_BRCA_survival_data_with_desired_columns


Unnamed: 0,os,os_time,pfs,pfs_time,Age,Stage,bcr_patient_barcode
0,0,8,0,8,30,Stage IIB,TCGA-PL-A8LY
1,1,2009,1,2009,79,Stage I,TCGA-BH-A18S
2,0,446,0,446,62,Stage IIB,TCGA-A7-A4SC
3,0,433,0,433,71,Stage IIA,TCGA-D8-A1Y2
4,0,385,0,385,55,Stage IIA,TCGA-C8-A12V
...,...,...,...,...,...,...,...
881,0,1150,0,1150,54,Stage IIB,TCGA-AR-A1AS
882,0,317,0,317,38,Stage IIIA,TCGA-C8-A1HL
883,0,849,1,681,73,Stage IIIA,TCGA-LQ-A4E4
884,0,172,0,172,32,Stage IA,TCGA-AN-A0FF


In [22]:
# Attach the survival/clinical columns to each nuHIF row. Rows are matched on
# 'sample_id' (nuHIF table) == 'bcr_patient_barcode' (survival table).
survival_keys = pathAI_TCGA_BRCA_survival_data_with_desired_columns['bcr_patient_barcode']

# An inner join silently discards samples that have no survival record, so
# report them explicitly instead of letting the row count shrink unnoticed.
lacks_survival = ~PathAI_HIFs_with_BRCA_status['sample_id'].isin(survival_keys)
if lacks_survival.any():
    print(
        f"WARNING: {int(lacks_survival.sum())} sample(s) have no survival record and are dropped by the merge: "
        f"{PathAI_HIFs_with_BRCA_status.loc[lacks_survival, 'sample_id'].tolist()}"
    )

# Inner merge, then drop the right-hand key, which duplicates 'sample_id'.
# .drop() returns a new frame, so no in-place mutation is needed.
merged_df1 = pd.merge(
    PathAI_HIFs_with_BRCA_status,
    pathAI_TCGA_BRCA_survival_data_with_desired_columns,
    left_on='sample_id',
    right_on='bcr_patient_barcode',
    how='inner',
).drop(columns='bcr_patient_barcode')

# Each matched nuHIF row should yield exactly one merged row. More rows than
# that means a barcode occurs several times in the survival table.
n_matched_rows = int((~lacks_survival).sum())
if len(merged_df1) != n_matched_rows:
    print(
        f"WARNING: merge produced {len(merged_df1)} rows from {n_matched_rows} matched samples; "
        "the survival table contains repeated 'bcr_patient_barcode' values."
    )

merged_df1

Unnamed: 0,sample_id,HER2_Status,PR_Status,ER_Status,BRCA_Subtypes,MEAN[CANCER_NUCLEUS_AREA]_H & E,MEAN[CANCER_NUCLEUS_CIRCULARITY]_H & E,MEAN[CANCER_NUCLEUS_ECCENTRICITY]_H & E,MEAN[CANCER_NUCLEUS_MAJOR_AXIS_LENGTH]_H & E,MEAN[CANCER_NUCLEUS_MEAN_LAB_CHANNEL_A]_H & E,...,STD[LYMPHOCYTE_NUCLEUS_STD_GRAYSCALE_CHANNEL_GRAY]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_HSV_CHANNEL_SATURATION]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_LAB_CHANNEL_A]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_LAB_CHANNEL_B]_H & E,os,os_time,pfs,pfs_time,Age,Stage
0,TCGA-D8-A1Y2,Negative,Positive,Positive,HR Positive,34.365021,0.788894,0.682284,8.251385,0.602840,...,0.020905,0.020543,0.003352,0.004804,0,433,0,433,71,Stage IIA
1,TCGA-C8-A12V,Negative,Negative,Negative,TNBC,39.071213,0.784081,0.686660,8.686616,0.600544,...,0.021876,0.021078,0.004379,0.006619,0,385,0,385,55,Stage IIA
2,TCGA-EW-A1P6,Negative,Positive,Positive,HR Positive,38.241444,0.810570,0.643006,8.426938,0.606436,...,0.020894,0.018527,0.004438,0.006021,0,562,0,562,64,Stage IIB
3,TCGA-E2-A15S,Negative,Negative,Positive,HR Positive,37.934071,0.813011,0.654944,8.453368,0.610344,...,0.020599,0.023713,0.003215,0.005068,0,428,0,428,34,Stage IIB
4,TCGA-D8-A1X5,Positive,Positive,Positive,HER2 Positive,49.571533,0.811955,0.643151,9.492211,0.578170,...,0.019769,0.019458,0.003928,0.005174,0,565,0,565,81,Stage IIIC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551,TCGA-A2-A04N,Negative,Positive,Positive,HR Positive,40.167149,0.786355,0.661289,8.731260,0.652192,...,0.021173,0.020896,0.004519,0.003546,0,4354,0,4354,66,Stage IA
552,TCGA-AR-A1AS,Negative,Positive,Positive,HR Positive,47.793850,0.759587,0.734216,10.062342,0.616858,...,0.022948,0.024541,0.003706,0.005923,0,1150,0,1150,54,Stage IIB
553,TCGA-C8-A1HL,Positive,Negative,Positive,HER2 Positive,48.468864,0.763141,0.713281,9.954347,0.607404,...,0.022812,0.023739,0.004350,0.006741,0,317,0,317,38,Stage IIIA
554,TCGA-AN-A0FF,Negative,Positive,Positive,HR Positive,33.570347,0.766258,0.702174,8.304225,0.594162,...,0.022200,0.018067,0.003412,0.007163,0,172,0,172,32,Stage IA


In [23]:
# Destination of the merged table: a CSV file inside the output folder
BrcaBiomarkerStatusToPathAI_nuHIFsToSurvival = out_path + "BrcaBiomarkerStatusToPathAI_nuHIFsToSurvival.csv"

# Save the merged nuHIF + survival table; the DataFrame index is not written
merged_df1.to_csv(path_or_buf=BrcaBiomarkerStatusToPathAI_nuHIFsToSurvival, index=False)

for message in (
    ("The files have been mapped and saved to:", BrcaBiomarkerStatusToPathAI_nuHIFsToSurvival),
    ("Done!",),
):
    print(*message)

The files have been mapped and saved to: TCGA_BRCA_FFPE/outputs/PathAI/Survival_V2/BrcaBiomarkerStatusToPathAI_nuHIFsToSurvival.csv
Done!
