In [32]:
#### -----------------------------------------------------------------------------------------------------------------------------------------
#### author: Ranjan Barman, date: Jul 11, 2024
#### Mapped TCGA_BRCA advance sub-types data to PathAI HIF data
#### ------------------------------------------------------------------------------------------------------------------------------------------
import os, sys, pickle, bz2
import numpy as np, pandas as pd
from time import time 
from tqdm import tqdm

# Set working directory as the parent directory where all datasets are saved
_wpath_ = "/data/Lab_ruppin/Ranjan/HnE/"
os.chdir(_wpath_)
print(f"working directory = {_wpath_}\n")

#%% get TCGA_BRCA subtypes & input HIF features data.

# Define dataset folder name
dataset_name = "PA_HIF_BRCA"

# Define input paths for sample list, HIF data, and subtype data
data_path = "/data/Ruppin_AI/BRCA_PIF/data/HIFs_Original/"

data_files = ["brca_hifs.csv",        ## Contains PathAI HIFs original feature data with other extra columns      
              "TCGA_BRCA_Subtypes_Class_Clinical.csv"] # Contains subtype labels for BRCA

# Create directory to save outputs if it doesn't exist
outputs_path = f"{dataset_name}/clinical_tcga_brca_outputs/"
os.makedirs(outputs_path, exist_ok=True)

# Read PathAI HIFs data
hif_df = pd.read_csv(os.path.join(data_path, data_files[0]))


# Read TCGA BRCA subtype classification data
subtype_df = pd.read_csv(os.path.join(data_path, data_files[1]))


hif_df
# subtype_df



working directory = /data/Lab_ruppin/Ranjan/HnE/



  hif_df = pd.read_csv(os.path.join(data_path, data_files[0]))


Unnamed: 0,CASE_ID,H & E_ID,AREA (MM2) OF [EPITHELIAL] IN [TISSUE]_HE,AREA (MM2) OF [ESI_0080] IN [TISSUE]_HE,AREA (MM2) OF [NECROSIS] IN [TISSUE]_HE,AREA (MM2) OF [STROMA] IN [TISSUE]_HE,AREA (MM2) OF [TUMOR] IN [TISSUE]_HE,AREA PROP [[EPITHELIAL] OVER [TUMOR]] IN [TISSUE]_HE,AREA PROP [[ESI_0080] OVER [TUMOR]] IN [TISSUE]_HE,AREA PROP [[STROMA] OVER [TUMOR]] IN [TISSUE]_HE,...,bcr_patient_barcode,last_contact_days_to,new_tumor_event_site,new_tumor_event_type,initial_pathologic_dx_year,new_tumor_event_dx_days_to,new_tumor_event_site_other,ajcc_pathologic_tumor_stage,treatment_outcome_first_course,age_at_initial_pathologic_diagnosis
0,20717,20717,10.993680,12.069884,5.589280,7.446144,18.439824,0.596192,0.654555,0.403808,...,TCGA-PL-A8LZ,302.0,,,2013.0,,,Stage IIIB,[Not Available],29.0
1,21250,21250,0.152784,0.088300,0.087696,0.787456,0.940240,0.162495,0.093912,0.837505,...,TCGA-PL-A8LY,8.0,,,2013.0,,,Stage IIB,[Not Available],30.0
2,23657,23657,3.799696,5.788504,1.264720,5.547648,9.347344,0.406500,0.619267,0.593500,...,TCGA-PL-A8LV,-7.0,,,2013.0,,,Stage IIIB,[Not Available],54.0
3,26488,26488,4.016880,1.866552,4.361264,3.106640,7.123520,0.563890,0.262027,0.436110,...,TCGA-PL-A8LX,5.0,,,2013.0,,,Stage IV,[Not Available],35.0
4,26524,26524,0.152784,0.088300,0.087696,0.787456,0.940240,0.162495,0.093912,0.837505,...,TCGA-PL-A8LY,8.0,,,2013.0,,,Stage IIB,[Not Available],30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1112,91896,91896,84.873312,126.567328,5.277904,131.999088,216.872400,0.391351,0.583603,0.608649,...,TCGA-AR-A1AS,1150.0,,,2007.0,,,Stage IIB,[Not Available],54.0
1113,91897,91897,59.296144,99.207272,2.358112,55.914608,115.210752,0.514675,0.861094,0.485325,...,TCGA-C8-A1HL,317.0,,,2010.0,,,Stage IIIA,[Not Available],38.0
1114,91901,91901,102.639328,108.589964,0.719472,71.078224,173.717552,0.590840,0.625095,0.409160,...,TCGA-LQ-A4E4,849.0,Bone,Distant Metastasis,2011.0,681.0,,Stage IIIA,[Not Available],73.0
1115,91908,91908,103.263728,159.315800,0.460560,88.305232,191.568960,0.539042,0.831637,0.460958,...,TCGA-AN-A0FF,172.0,,,2010.0,,,Stage IA,[Not Available],32.0


In [33]:
# Select 'bcr_patient_barcode' and columns 2 to 609 (inclusive)
selected_columns = ['bcr_patient_barcode'] + hif_df.columns[2:609].tolist()
hif_df_filtered = hif_df[selected_columns]
hif_df_filtered

Unnamed: 0,bcr_patient_barcode,AREA (MM2) OF [EPITHELIAL] IN [TISSUE]_HE,AREA (MM2) OF [ESI_0080] IN [TISSUE]_HE,AREA (MM2) OF [NECROSIS] IN [TISSUE]_HE,AREA (MM2) OF [STROMA] IN [TISSUE]_HE,AREA (MM2) OF [TUMOR] IN [TISSUE]_HE,AREA PROP [[EPITHELIAL] OVER [TUMOR]] IN [TISSUE]_HE,AREA PROP [[ESI_0080] OVER [TUMOR]] IN [TISSUE]_HE,AREA PROP [[STROMA] OVER [TUMOR]] IN [TISSUE]_HE,CELL CLUSTERING PROPERTIES: BALL-HALL INDEX OF CANCER IN EPITHELIAL_HE,...,TOTAL [LYMPHOCYTE CELLS] IN [STROMA]_HE,TOTAL [LYMPHOCYTE CELLS] IN [TUMOR]_HE,TOTAL [MACROPHAGE CELLS] IN [EPITHELIAL]_HE,TOTAL [MACROPHAGE CELLS] IN [ESI_0080]_HE,TOTAL [MACROPHAGE CELLS] IN [STROMA]_HE,TOTAL [MACROPHAGE CELLS] IN [TUMOR]_HE,TOTAL [PLASMA CELLS] IN [EPITHELIAL]_HE,TOTAL [PLASMA CELLS] IN [ESI_0080]_HE,TOTAL [PLASMA CELLS] IN [STROMA]_HE,TOTAL [PLASMA CELLS] IN [TUMOR]_HE
0,TCGA-PL-A8LZ,10.993680,12.069884,5.589280,7.446144,18.439824,0.596192,0.654555,0.403808,6711.424734,...,6139,9939,481,1064,1554,2035,449,541,376,825
1,TCGA-PL-A8LY,0.152784,0.088300,0.087696,0.787456,0.940240,0.162495,0.093912,0.837505,4336.717907,...,193,221,54,5,65,119,21,2,40,61
2,TCGA-PL-A8LV,3.799696,5.788504,1.264720,5.547648,9.347344,0.406500,0.619267,0.593500,6074.047540,...,4767,7584,200,441,629,829,202,249,241,443
3,TCGA-PL-A8LX,4.016880,1.866552,4.361264,3.106640,7.123520,0.563890,0.262027,0.436110,6801.983421,...,770,887,86,81,507,593,53,27,56,109
4,TCGA-PL-A8LY,0.152784,0.088300,0.087696,0.787456,0.940240,0.162495,0.093912,0.837505,4336.717907,...,193,221,54,5,65,119,21,2,40,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1112,TCGA-AR-A1AS,84.873312,126.567328,5.277904,131.999088,216.872400,0.391351,0.583603,0.608649,6678.555146,...,35730,43884,4936,7564,10708,15644,1728,2429,2195,3923
1113,TCGA-C8-A1HL,59.296144,99.207272,2.358112,55.914608,115.210752,0.514675,0.861094,0.485325,6865.299931,...,19280,21334,880,5413,6925,7805,623,2862,3035,3658
1114,TCGA-LQ-A4E4,102.639328,108.589964,0.719472,71.078224,173.717552,0.590840,0.625095,0.409160,6539.283886,...,31770,69827,13693,10566,9428,23121,12288,11241,3697,15985
1115,TCGA-AN-A0FF,103.263728,159.315800,0.460560,88.305232,191.568960,0.539042,0.831637,0.460958,7373.703826,...,198042,250645,4704,28155,33254,37958,6262,13714,10201,16463


In [34]:
# Rename 'bcr_patient_barcode' to 'sample_id'
hif_df_filtered = hif_df_filtered.rename(columns={'bcr_patient_barcode': 'sample_id'})
hif_df_filtered

# Step 1: Average HIF rows if multiple rows per sample_id exist
hif_df_filtered_avg = hif_df_filtered.groupby('sample_id', as_index=False).mean()
hif_df_filtered_avg

Unnamed: 0,sample_id,AREA (MM2) OF [EPITHELIAL] IN [TISSUE]_HE,AREA (MM2) OF [ESI_0080] IN [TISSUE]_HE,AREA (MM2) OF [NECROSIS] IN [TISSUE]_HE,AREA (MM2) OF [STROMA] IN [TISSUE]_HE,AREA (MM2) OF [TUMOR] IN [TISSUE]_HE,AREA PROP [[EPITHELIAL] OVER [TUMOR]] IN [TISSUE]_HE,AREA PROP [[ESI_0080] OVER [TUMOR]] IN [TISSUE]_HE,AREA PROP [[STROMA] OVER [TUMOR]] IN [TISSUE]_HE,CELL CLUSTERING PROPERTIES: BALL-HALL INDEX OF CANCER IN EPITHELIAL_HE,...,TOTAL [LYMPHOCYTE CELLS] IN [STROMA]_HE,TOTAL [LYMPHOCYTE CELLS] IN [TUMOR]_HE,TOTAL [MACROPHAGE CELLS] IN [EPITHELIAL]_HE,TOTAL [MACROPHAGE CELLS] IN [ESI_0080]_HE,TOTAL [MACROPHAGE CELLS] IN [STROMA]_HE,TOTAL [MACROPHAGE CELLS] IN [TUMOR]_HE,TOTAL [PLASMA CELLS] IN [EPITHELIAL]_HE,TOTAL [PLASMA CELLS] IN [ESI_0080]_HE,TOTAL [PLASMA CELLS] IN [STROMA]_HE,TOTAL [PLASMA CELLS] IN [TUMOR]_HE
0,TCGA-3C-AALI,57.495120,107.705248,0.724840,102.294608,159.789728,0.357106,0.676589,0.642894,6441.311547,...,109710.0,123627.5,3774.5,20322.0,31994.5,35769.0,8453.0,18336.5,18637.5,27090.5
1,TCGA-3C-AALJ,82.541656,161.238112,3.687592,133.333344,215.875000,0.383636,0.747456,0.616364,6535.939707,...,54921.0,60213.5,2428.5,12801.5,22001.0,24429.5,1295.5,3917.0,5656.5,6952.0
2,TCGA-3C-AALK,26.877520,53.040932,3.571664,44.355952,71.233472,0.377316,0.744607,0.622684,5833.355998,...,41713.0,45076.0,1546.0,7789.0,12735.0,14281.0,729.0,2571.0,4174.0,4903.0
3,TCGA-4H-AAAK,43.128576,111.292004,0.742096,98.095856,141.224432,0.305390,0.788051,0.694610,7160.616676,...,35540.0,44382.0,1897.0,5594.0,9934.0,11831.0,1837.0,2488.0,1965.0,3802.0
4,TCGA-5L-AAT0,0.685216,2.596956,1.605504,8.142384,8.827600,0.077622,0.294186,0.922378,4479.384456,...,15786.0,17273.0,452.0,1604.0,4450.0,4902.0,181.0,289.0,527.0,708.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1037,TCGA-WT-AB44,9.903376,11.445204,0.085904,11.510544,21.413920,0.462474,0.534475,0.537526,5228.674659,...,5216.0,7137.0,1701.0,1250.0,2016.0,3717.0,270.0,322.0,486.0,756.0
1038,TCGA-XX-A899,29.783936,47.477228,0.007680,23.035728,52.819664,0.563880,0.898855,0.436120,6143.226420,...,8070.0,13332.0,1061.0,2049.0,1739.0,2800.0,3023.0,3154.0,735.0,3758.0
1039,TCGA-XX-A89A,49.370816,87.116700,0.016976,62.198960,111.569776,0.442511,0.780827,0.557489,5869.237691,...,27118.0,46429.0,8881.0,6618.0,5780.0,14661.0,3364.0,3586.0,1754.0,5118.0
1040,TCGA-Z7-A8R5,11.947152,20.351728,1.422320,12.491184,24.438336,0.488869,0.832779,0.511131,5604.175185,...,10171.0,18026.0,1739.0,2683.0,1693.0,3432.0,1439.0,1578.0,372.0,1811.0


In [35]:
# Create 'sample_id' by removing '-01' suffix to align with HIF sample IDs
subtype_df['sampleID'] = subtype_df['sampleID'].str.replace('-01', '', regex=False)
subtype_df

# Rename multiple columns for consistency
subtype_df = subtype_df.rename(columns={
    'sampleID': 'sample_id',
    'HER2_Final_Status_nature2012': 'HER2_Status',
    'PR_Status_nature2012': 'PR_Status',
    'ER_Status_nature2012': 'ER_Status'
})

subtype_df

Unnamed: 0,sample_id,HER2_Status,PR_Status,ER_Status
0,TCGA-A1-A0SB,Negative,Negative,Positive
1,TCGA-A1-A0SD,Negative,Positive,Positive
2,TCGA-A1-A0SE,Negative,Positive,Positive
3,TCGA-A1-A0SF,Negative,Positive,Positive
4,TCGA-A1-A0SG,Negative,Positive,Positive
...,...,...,...,...
739,TCGA-EW-A1PB,Negative,Negative,Negative
740,TCGA-EW-A1PD,Positive,Positive,Positive
741,TCGA-EW-A1PE,Negative,Positive,Positive
742,TCGA-EW-A1PF,Negative,Positive,Positive


In [36]:
# Merge on 'sample_id' (inner join to retain matched samples only)
merged_df = pd.merge(subtype_df, hif_df_filtered_avg, on='sample_id', how='inner')
merged_df

Unnamed: 0,sample_id,HER2_Status,PR_Status,ER_Status,AREA (MM2) OF [EPITHELIAL] IN [TISSUE]_HE,AREA (MM2) OF [ESI_0080] IN [TISSUE]_HE,AREA (MM2) OF [NECROSIS] IN [TISSUE]_HE,AREA (MM2) OF [STROMA] IN [TISSUE]_HE,AREA (MM2) OF [TUMOR] IN [TISSUE]_HE,AREA PROP [[EPITHELIAL] OVER [TUMOR]] IN [TISSUE]_HE,...,TOTAL [LYMPHOCYTE CELLS] IN [STROMA]_HE,TOTAL [LYMPHOCYTE CELLS] IN [TUMOR]_HE,TOTAL [MACROPHAGE CELLS] IN [EPITHELIAL]_HE,TOTAL [MACROPHAGE CELLS] IN [ESI_0080]_HE,TOTAL [MACROPHAGE CELLS] IN [STROMA]_HE,TOTAL [MACROPHAGE CELLS] IN [TUMOR]_HE,TOTAL [PLASMA CELLS] IN [EPITHELIAL]_HE,TOTAL [PLASMA CELLS] IN [ESI_0080]_HE,TOTAL [PLASMA CELLS] IN [STROMA]_HE,TOTAL [PLASMA CELLS] IN [TUMOR]_HE
0,TCGA-A1-A0SB,Negative,Negative,Positive,5.884160,8.505324,1.389632,12.406016,18.290176,0.321712,...,19166.0,24610.0,706.0,1362.0,2353.0,3059.0,825.0,949.0,1383.0,2208.0
1,TCGA-A1-A0SD,Negative,Positive,Positive,7.959952,15.879072,1.843632,18.089680,26.049632,0.305569,...,8826.0,10351.0,999.0,1207.0,2520.0,3519.0,311.0,412.0,523.0,834.0
2,TCGA-A1-A0SE,Negative,Positive,Positive,10.708352,20.749744,0.011680,17.366224,28.074576,0.381425,...,2739.0,3675.0,146.0,412.0,465.0,611.0,144.0,269.0,260.0,404.0
3,TCGA-A1-A0SF,Negative,Positive,Positive,1.600576,13.634196,0.061824,31.309600,32.910176,0.048635,...,37415.0,39063.0,1171.0,4474.0,8911.0,10082.0,252.0,1546.0,3023.0,3275.0
4,TCGA-A1-A0SH,Negative,Positive,Negative,6.305520,14.302604,0.235024,13.158768,19.464288,0.323953,...,5098.0,5776.0,310.0,442.0,936.0,1246.0,207.0,389.0,427.0,634.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718,TCGA-EW-A1PB,Negative,Negative,Negative,103.655120,92.218780,12.361056,41.098096,144.753216,0.716082,...,97532.0,167641.0,4818.0,17008.0,20185.0,25003.0,8546.0,14439.0,10629.0,19175.0
719,TCGA-EW-A1PD,Positive,Positive,Positive,12.700832,22.796892,0.269024,24.320192,37.021024,0.343071,...,7446.0,8802.0,859.0,1119.0,3124.0,3983.0,215.0,261.0,526.0,741.0
720,TCGA-EW-A1PE,Negative,Positive,Positive,87.381168,109.834856,1.369792,73.197504,160.578672,0.544164,...,32669.0,42024.0,2351.0,5264.0,7560.0,9911.0,3636.0,7865.0,7104.0,10740.0
721,TCGA-EW-A1PF,Negative,Positive,Positive,15.787328,39.763816,0.046576,34.822016,50.609344,0.311945,...,37521.0,40134.0,946.0,3141.0,6808.0,7754.0,453.0,1587.0,2399.0,2852.0


In [37]:
# Define path to save the merged output
tcga_brca_subtype_to_original_hifs = f"{outputs_path}tcga_brca_subtype_to_original_hifs.csv"

# Save the final merged dataframe to CSV
merged_df.to_csv(tcga_brca_subtype_to_original_hifs, index=False)

# Confirm completion
print("The files have been mapped and saved to:", tcga_brca_subtype_to_original_hifs)
print("Done!")

The files have been mapped and saved to: PA_HIF_BRCA/clinical_tcga_brca_outputs/tcga_brca_subtype_to_original_hifs.csv
Done!
