In [1]:
#### -----------------------------------------------------------------------------------------------------------------------------------------
#### author: Ranjan Barman, date: Aug 27, 2024
#### Merge the PathAI HIF and nuHIF feature tables (with TCGA-BRCA biomarker status) and save a selected feature subset as the PIF table
#### ------------------------------------------------------------------------------------------------------------------------------------------
import os, sys, pickle, bz2
import numpy as np, pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from time import time 
from tqdm import tqdm

# Parent directory under which all dataset folders are saved; the relative paths used
# in the following cells are resolved against it.
# Set the HNE_WORKDIR environment variable to run the notebook on another machine;
# when it is not set, the original location is used.
_wpath_ = os.environ.get("HNE_WORKDIR", "/data/Lab_ruppin/Ranjan/HnE/")
os.chdir(_wpath_)        # raises FileNotFoundError if the directory does not exist

print(f"working directory = {_wpath_}\n")

working directory = /data/Lab_ruppin/Ranjan/HnE/



In [2]:
#%% Load the two PathAI feature tables (nuHIFs and HIFs) and set up the PIF output directory.


# Input CSV files, relative to the working directory:
#   data_files[0] -> nuHIF table
#   data_files[1] -> HIF table
data_files   = ["PA_NUHIF_BRCA/outputs_biomarker_status/tcga_brca_subtype_to_original_nuhifs.csv",
                "PA_HIF_BRCA/clinical_tcga_brca_outputs/tcga_brca_subtype_to_original_hifs.csv"]

# Data directories & files
dataset_name = "PA_PIF_BRCA"
feature_name = "PIF"
outcome_names = ["HER2_Status", "PR_Status", "ER_Status"]

## create directories to save outputs
outputs_path = f"{dataset_name}/outputs_biomarker_status/"
os.makedirs(outputs_path, exist_ok = True)

# NOTE(review): the two variable names below are swapped relative to the files they load.
# The names are kept unchanged because the following cells refer to them.

# Reads the nuHIF file (data_files[0]) -- despite the "hifs" in the variable name.
PathAI_hifs_with_status = pd.read_csv( data_files[0])

# Reads the HIF file (data_files[1]) -- despite the "nuhifs" in the variable name.
PathAI_nuhifs_with_status = pd.read_csv(data_files[1])

# Only the last expression of a cell is rendered automatically, so the first table is
# shown explicitly (display is provided by the IPython kernel).
display(PathAI_hifs_with_status)
PathAI_nuhifs_with_status


Unnamed: 0,sample_id,HER2_Status,PR_Status,ER_Status,AREA (MM2) OF [EPITHELIAL] IN [TISSUE]_HE,AREA (MM2) OF [ESI_0080] IN [TISSUE]_HE,AREA (MM2) OF [NECROSIS] IN [TISSUE]_HE,AREA (MM2) OF [STROMA] IN [TISSUE]_HE,AREA (MM2) OF [TUMOR] IN [TISSUE]_HE,AREA PROP [[EPITHELIAL] OVER [TUMOR]] IN [TISSUE]_HE,...,TOTAL [LYMPHOCYTE CELLS] IN [STROMA]_HE,TOTAL [LYMPHOCYTE CELLS] IN [TUMOR]_HE,TOTAL [MACROPHAGE CELLS] IN [EPITHELIAL]_HE,TOTAL [MACROPHAGE CELLS] IN [ESI_0080]_HE,TOTAL [MACROPHAGE CELLS] IN [STROMA]_HE,TOTAL [MACROPHAGE CELLS] IN [TUMOR]_HE,TOTAL [PLASMA CELLS] IN [EPITHELIAL]_HE,TOTAL [PLASMA CELLS] IN [ESI_0080]_HE,TOTAL [PLASMA CELLS] IN [STROMA]_HE,TOTAL [PLASMA CELLS] IN [TUMOR]_HE
0,TCGA-A1-A0SB,Negative,Negative,Positive,5.884160,8.505324,1.389632,12.406016,18.290176,0.321712,...,19166.0,24610.0,706.0,1362.0,2353.0,3059.0,825.0,949.0,1383.0,2208.0
1,TCGA-A1-A0SD,Negative,Positive,Positive,7.959952,15.879072,1.843632,18.089680,26.049632,0.305569,...,8826.0,10351.0,999.0,1207.0,2520.0,3519.0,311.0,412.0,523.0,834.0
2,TCGA-A1-A0SE,Negative,Positive,Positive,10.708352,20.749744,0.011680,17.366224,28.074576,0.381425,...,2739.0,3675.0,146.0,412.0,465.0,611.0,144.0,269.0,260.0,404.0
3,TCGA-A1-A0SF,Negative,Positive,Positive,1.600576,13.634196,0.061824,31.309600,32.910176,0.048635,...,37415.0,39063.0,1171.0,4474.0,8911.0,10082.0,252.0,1546.0,3023.0,3275.0
4,TCGA-A1-A0SH,Negative,Positive,Negative,6.305520,14.302604,0.235024,13.158768,19.464288,0.323953,...,5098.0,5776.0,310.0,442.0,936.0,1246.0,207.0,389.0,427.0,634.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718,TCGA-EW-A1PB,Negative,Negative,Negative,103.655120,92.218780,12.361056,41.098096,144.753216,0.716082,...,97532.0,167641.0,4818.0,17008.0,20185.0,25003.0,8546.0,14439.0,10629.0,19175.0
719,TCGA-EW-A1PD,Positive,Positive,Positive,12.700832,22.796892,0.269024,24.320192,37.021024,0.343071,...,7446.0,8802.0,859.0,1119.0,3124.0,3983.0,215.0,261.0,526.0,741.0
720,TCGA-EW-A1PE,Negative,Positive,Positive,87.381168,109.834856,1.369792,73.197504,160.578672,0.544164,...,32669.0,42024.0,2351.0,5264.0,7560.0,9911.0,3636.0,7865.0,7104.0,10740.0
721,TCGA-EW-A1PF,Negative,Positive,Positive,15.787328,39.763816,0.046576,34.822016,50.609344,0.311945,...,37521.0,40134.0,946.0,3141.0,6808.0,7754.0,453.0,1587.0,2399.0,2852.0


In [3]:
# Drop the biomarker status columns from PathAI_hifs_with_status.
# NOTE(review): despite its name, this frame was read from the nuHIF CSV (data_files[0]).
# If these columns stayed in both frames, the merge on 'sample_id' would suffix the
# duplicated names (_x / _y) and the plain status columns selected later would not exist.
# errors="ignore" keeps the cell re-runnable: the frame is reassigned to itself, so a
# second run would otherwise raise KeyError because the columns are already gone.
PathAI_hifs_with_status = PathAI_hifs_with_status.drop(
    columns=['HER2_Status', 'PR_Status', 'ER_Status'],
    errors="ignore",
)
PathAI_hifs_with_status

Unnamed: 0,sample_id,BRCA_Subtypes,MEAN[CANCER_NUCLEUS_AREA]_H & E,MEAN[CANCER_NUCLEUS_CIRCULARITY]_H & E,MEAN[CANCER_NUCLEUS_ECCENTRICITY]_H & E,MEAN[CANCER_NUCLEUS_MAJOR_AXIS_LENGTH]_H & E,MEAN[CANCER_NUCLEUS_MEAN_LAB_CHANNEL_A]_H & E,MEAN[CANCER_NUCLEUS_MEAN_LAB_CHANNEL_B]_H & E,MEAN[CANCER_NUCLEUS_MINOR_AXIS_LENGTH]_H & E,MEAN[CANCER_NUCLEUS_MIN_GRAYSCALE_CHANNEL_GRAY]_H & E,...,STD[LYMPHOCYTE_NUCLEUS_MEAN_LAB_CHANNEL_B]_H & E,STD[LYMPHOCYTE_NUCLEUS_MINOR_AXIS_LENGTH]_H & E,STD[LYMPHOCYTE_NUCLEUS_MIN_GRAYSCALE_CHANNEL_GRAY]_H & E,STD[LYMPHOCYTE_NUCLEUS_MIN_HSV_CHANNEL_SATURATION]_H & E,STD[LYMPHOCYTE_NUCLEUS_PERIMETER]_H & E,STD[LYMPHOCYTE_NUCLEUS_SOLIDITY]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_GRAYSCALE_CHANNEL_GRAY]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_HSV_CHANNEL_SATURATION]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_LAB_CHANNEL_A]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_LAB_CHANNEL_B]_H & E
0,TCGA-D8-A1Y2,HR Positive,34.365021,0.788894,0.682284,8.251385,0.602840,0.412379,5.653194,0.151201,...,0.007224,0.801812,0.051126,0.065717,3.289017,0.019602,0.020905,0.020543,0.003352,0.004804
1,TCGA-C8-A12V,TNBC,39.071213,0.784081,0.686660,8.686616,0.600544,0.413815,5.919993,0.170848,...,0.017781,0.727386,0.057589,0.072681,3.023650,0.019152,0.021876,0.021078,0.004379,0.006619
2,TCGA-EW-A1P6,HR Positive,38.241444,0.810570,0.643006,8.426938,0.606436,0.413195,6.092019,0.189393,...,0.008084,0.759774,0.039343,0.064644,2.844278,0.016114,0.020894,0.018527,0.004438,0.006021
3,TCGA-AR-A1AJ,HR Positive,37.569180,0.756555,0.742634,9.057047,0.615502,0.423559,5.576258,0.119670,...,0.007268,0.646987,0.033704,0.076382,2.803650,0.020363,0.021541,0.025585,0.003586,0.006296
4,TCGA-E2-A15S,HR Positive,37.934071,0.813011,0.654944,8.453368,0.610344,0.407051,6.011490,0.222011,...,0.007589,0.618312,0.048938,0.055562,2.364664,0.012161,0.020599,0.023713,0.003215,0.005068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577,TCGA-A2-A04N,HR Positive,40.167149,0.786355,0.661289,8.731260,0.652192,0.405773,6.183293,0.197981,...,0.006646,0.706071,0.044662,0.079656,2.822683,0.016338,0.021173,0.020896,0.004519,0.003546
578,TCGA-AR-A1AS,HR Positive,47.793850,0.759587,0.734216,10.062342,0.616858,0.429411,6.332615,0.131279,...,0.008048,0.731268,0.040083,0.078726,3.091555,0.018503,0.022948,0.024541,0.003706,0.005923
579,TCGA-C8-A1HL,HER2 Positive,48.468864,0.763141,0.713281,9.954347,0.607404,0.399618,6.505261,0.147581,...,0.013180,0.781547,0.060307,0.072402,3.213238,0.018057,0.022812,0.023739,0.004350,0.006741
580,TCGA-AN-A0FF,HR Positive,33.570347,0.766258,0.702174,8.304225,0.594162,0.405842,5.511056,0.127157,...,0.007930,0.731670,0.033249,0.076339,3.251431,0.021967,0.022200,0.018067,0.003412,0.007163


In [4]:
# Inner join of the two feature tables on 'sample_id': only samples present in both are kept.
# Left table: PathAI_nuhifs_with_status, right table: PathAI_hifs_with_status.
merged_df = PathAI_nuhifs_with_status.merge(
    PathAI_hifs_with_status,
    how='inner',
    on='sample_id',
)
merged_df

Unnamed: 0,sample_id,HER2_Status,PR_Status,ER_Status,AREA (MM2) OF [EPITHELIAL] IN [TISSUE]_HE,AREA (MM2) OF [ESI_0080] IN [TISSUE]_HE,AREA (MM2) OF [NECROSIS] IN [TISSUE]_HE,AREA (MM2) OF [STROMA] IN [TISSUE]_HE,AREA (MM2) OF [TUMOR] IN [TISSUE]_HE,AREA PROP [[EPITHELIAL] OVER [TUMOR]] IN [TISSUE]_HE,...,STD[LYMPHOCYTE_NUCLEUS_MEAN_LAB_CHANNEL_B]_H & E,STD[LYMPHOCYTE_NUCLEUS_MINOR_AXIS_LENGTH]_H & E,STD[LYMPHOCYTE_NUCLEUS_MIN_GRAYSCALE_CHANNEL_GRAY]_H & E,STD[LYMPHOCYTE_NUCLEUS_MIN_HSV_CHANNEL_SATURATION]_H & E,STD[LYMPHOCYTE_NUCLEUS_PERIMETER]_H & E,STD[LYMPHOCYTE_NUCLEUS_SOLIDITY]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_GRAYSCALE_CHANNEL_GRAY]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_HSV_CHANNEL_SATURATION]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_LAB_CHANNEL_A]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_LAB_CHANNEL_B]_H & E
0,TCGA-A1-A0SB,Negative,Negative,Positive,5.884160,8.505324,1.389632,12.406016,18.290176,0.321712,...,0.012784,0.812228,0.073148,0.056193,3.400857,0.020177,0.021926,0.020004,0.003342,0.005770
1,TCGA-A1-A0SD,Negative,Positive,Positive,7.959952,15.879072,1.843632,18.089680,26.049632,0.305569,...,0.010152,0.822757,0.057387,0.056443,3.579359,0.022114,0.018869,0.017774,0.003357,0.004713
2,TCGA-A1-A0SE,Negative,Positive,Positive,10.708352,20.749744,0.011680,17.366224,28.074576,0.381425,...,0.007626,0.743510,0.069081,0.051078,3.303308,0.020030,0.020871,0.020930,0.003658,0.005619
3,TCGA-A1-A0SF,Negative,Positive,Positive,1.600576,13.634196,0.061824,31.309600,32.910176,0.048635,...,0.011495,0.654966,0.061102,0.069353,2.750405,0.017523,0.021017,0.019035,0.003750,0.008310
4,TCGA-A1-A0SH,Negative,Positive,Negative,6.305520,14.302604,0.235024,13.158768,19.464288,0.323953,...,0.008003,0.816648,0.038583,0.069684,3.736626,0.021366,0.018393,0.018119,0.003008,0.004937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
572,TCGA-EW-A1PB,Negative,Negative,Negative,103.655120,92.218780,12.361056,41.098096,144.753216,0.716082,...,0.008867,0.734930,0.036128,0.061353,2.915236,0.018231,0.021077,0.015968,0.003421,0.006313
573,TCGA-EW-A1PD,Positive,Positive,Positive,12.700832,22.796892,0.269024,24.320192,37.021024,0.343071,...,0.009527,0.815088,0.049075,0.074780,3.322919,0.023174,0.020318,0.017004,0.003177,0.005130
574,TCGA-EW-A1PE,Negative,Positive,Positive,87.381168,109.834856,1.369792,73.197504,160.578672,0.544164,...,0.007697,0.770643,0.050192,0.046928,3.052151,0.018066,0.018195,0.013952,0.003058,0.005056
575,TCGA-EW-A1PF,Negative,Positive,Positive,15.787328,39.763816,0.046576,34.822016,50.609344,0.311945,...,0.007349,0.763858,0.047087,0.057223,3.167193,0.017711,0.019919,0.015881,0.002874,0.005056


In [5]:
# Feature columns kept in the final table, in the order they are written out.
selected_features = [
    # cancer nucleus size / shape: mean over nuclei
    "MEAN[CANCER_NUCLEUS_AREA]_H & E",
    "MEAN[CANCER_NUCLEUS_MAJOR_AXIS_LENGTH]_H & E",
    "MEAN[CANCER_NUCLEUS_MINOR_AXIS_LENGTH]_H & E",
    "MEAN[CANCER_NUCLEUS_PERIMETER]_H & E",
    "MEAN[CANCER_NUCLEUS_CIRCULARITY]_H & E",
    "MEAN[CANCER_NUCLEUS_ECCENTRICITY]_H & E",
    # cancer nucleus size / shape: standard deviation over nuclei
    "STD[CANCER_NUCLEUS_AREA]_H & E",
    "STD[CANCER_NUCLEUS_MAJOR_AXIS_LENGTH]_H & E",
    "STD[CANCER_NUCLEUS_MINOR_AXIS_LENGTH]_H & E",
    "STD[CANCER_NUCLEUS_PERIMETER]_H & E",
    "STD[CANCER_NUCLEUS_CIRCULARITY]_H & E",
    "STD[CANCER_NUCLEUS_ECCENTRICITY]_H & E",
    # cancer cell density
    "DENSITY [CANCER CELLS] IN [TUMOR]_HE",
    "DENSITY RATIO [CANCER CELLS] IN [[EPITHELIAL] OVER [TUMOR]]_HE",
    # tissue area proportions relative to tumor
    "AREA PROP [[EPITHELIAL] OVER [TUMOR]] IN [TISSUE]_HE",
    "AREA PROP [[ESI_0080] OVER [TUMOR]] IN [TISSUE]_HE",
    "AREA PROP [[STROMA] OVER [TUMOR]] IN [TISSUE]_HE",
    # tumor region shape properties
    "REGION PROPERTIES: AVERAGE ECCENTRICITY OF SIGNIFICANT REGIONS OF TUMOR_HE",
    "REGION PROPERTIES: AVERAGE SOLIDITY OF SIGNIFICANT REGIONS OF TUMOR_HE",
    "REGION PROPERTIES: ECCENTRICITY OF LARGEST REGION OF TUMOR_HE",
    # cancer nucleus intensity: standard deviation over nuclei
    "STD[CANCER_NUCLEUS_MIN_GRAYSCALE_CHANNEL_GRAY]_H & E",
    "STD[CANCER_NUCLEUS_MIN_HSV_CHANNEL_SATURATION]_H & E",
    # tumor region filled area and lacunarity
    "REGION PROPERTIES: FILLED AREA (MM2) OF LARGEST REGION OF TUMOR_HE",
    "REGION PROPERTIES: LACUNARITY OF LARGEST REGION OF TUMOR_HE",
    "REGION PROPERTIES: LACUNARITY OF TUMOR_HE",
]

# Identifier and biomarker status columns placed in front of the features
meta_columns = ["sample_id", "HER2_Status", "PR_Status", "ER_Status"]

# Restrict merged_df to the metadata columns followed by the selected features
# (raises KeyError if any of these columns is missing from merged_df)
filtered_df = merged_df.loc[:, [*meta_columns, *selected_features]]



In [6]:
# Destination CSV inside the outputs directory (outputs_path already ends with "/")
tcga_brca_subtype_to_original_pifs = outputs_path + "tcga_brca_subtype_to_original_pifs.csv"

# Save the filtered table; the row index is not written
filtered_df.to_csv(tcga_brca_subtype_to_original_pifs, index=False)

print(f"The files have been mapped and saved to: {tcga_brca_subtype_to_original_pifs}")
print("Done!")

The files have been mapped and saved to: PA_PIF_BRCA/outputs_biomarker_status/tcga_brca_subtype_to_original_pifs.csv
Done!
