In [11]:
#### -----------------------------------------------------------------------------------------------------------------------------------------
#### author: Ranjan Barman, date: Aug 27, 2024
#### Map PathAI metadata and nuHIF features to TCGA biomarker status (HER2 / PR / ER)
#### ------------------------------------------------------------------------------------------------------------------------------------------
import os, sys, pickle, bz2
import numpy as np, pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from time import time 
from tqdm import tqdm

# Parent directory that holds every dataset folder; all paths used later in this
# notebook are relative to it. The HNE_WORKDIR environment variable, when set,
# overrides the default location so the notebook can run on another machine.
_wpath_ = os.environ.get("HNE_WORKDIR", "/data/Lab_ruppin/Ranjan/HnE/")
os.chdir(_wpath_)                               # raises FileNotFoundError if the directory does not exist

print(f"working directory = {_wpath_}\n")

working directory = /data/Lab_ruppin/Ranjan/HnE/



In [12]:
#%% Configure input/output paths and load the PathAI metadata and nuHIF tables.

# PathAI nuHIF dataset: input folder and the two Excel workbooks read below
dataset_name = "PA_NUHIF_BRCA"

data_path    = [f"{dataset_name}/data/"]
data_files   = ["PathAI_BRCA_MetaData.xlsx",
                "PathAI_BRCA_NuHIFs.xlsx"]

# PathAI HIF dataset: folder and file name of the TCGA BRCA clinical subtype table
dataset_name1 = "PA_HIF_BRCA"
feature_name1 = "HIF"
outcome_names = ["HER2_Status", "PR_Status", "ER_Status"]

data_path1 = f"{dataset_name1}/outputs_clinical/"
data_file1 = "TCGA_BRCA_Subtypes_clinical.tsv"

## create the directory that receives this notebook's outputs
outputs_path = f"{dataset_name}/outputs_biomarker_status/"
os.makedirs(outputs_path, exist_ok = True)

# read the PathAI metadata workbook
PathAI_meta_data = pd.read_excel(os.path.join(data_path[0], data_files[0]))

# read the PathAI nuHIF workbook
PathAI_nuHIFs_data = pd.read_excel(os.path.join(data_path[0], data_files[1]))

# Preview both tables. A notebook cell only renders its *last* expression
# automatically, so the metadata preview has to go through display() (the
# builtin that IPython/Jupyter injects into the notebook namespace).
display(PathAI_meta_data.head())
PathAI_nuHIFs_data.head()

Unnamed: 0,H & E_ID,IQR[(UNKNOWN)_NUCLEUS_AREA]_H & E,IQR[(UNKNOWN)_NUCLEUS_CIRCULARITY]_H & E,IQR[(UNKNOWN)_NUCLEUS_ECCENTRICITY]_H & E,IQR[(UNKNOWN)_NUCLEUS_MAJOR_AXIS_LENGTH]_H & E,IQR[(UNKNOWN)_NUCLEUS_MAX_GRAYSCALE_CHANNEL_GRAY]_H & E,IQR[(UNKNOWN)_NUCLEUS_MAX_HSV_CHANNEL_SATURATION]_H & E,IQR[(UNKNOWN)_NUCLEUS_MAX_LAB_CHANNEL_A]_H & E,IQR[(UNKNOWN)_NUCLEUS_MAX_LAB_CHANNEL_B]_H & E,IQR[(UNKNOWN)_NUCLEUS_MEAN_GRAYSCALE_CHANNEL_GRAY]_H & E,...,STD[PLASMA_NUCLEUS_MIN_HSV_CHANNEL_SATURATION]_H & E,STD[PLASMA_NUCLEUS_MIN_LAB_CHANNEL_A]_H & E,STD[PLASMA_NUCLEUS_MIN_LAB_CHANNEL_B]_H & E,STD[PLASMA_NUCLEUS_ORIENTATION]_H & E,STD[PLASMA_NUCLEUS_PERIMETER]_H & E,STD[PLASMA_NUCLEUS_SOLIDITY]_H & E,STD[PLASMA_NUCLEUS_STD_GRAYSCALE_CHANNEL_GRAY]_H & E,STD[PLASMA_NUCLEUS_STD_HSV_CHANNEL_SATURATION]_H & E,STD[PLASMA_NUCLEUS_STD_LAB_CHANNEL_A]_H & E,STD[PLASMA_NUCLEUS_STD_LAB_CHANNEL_B]_H & E
0,26524,24.361557,0.318226,0.187292,4.619149,0.290089,0.141176,0.031373,0.039216,0.167744,...,0.088273,0.012206,0.017756,1.213632,4.311863,0.010056,0.020215,0.0189,0.002124,0.003534
1,75662,18.216915,0.312133,0.221327,3.508829,0.227006,0.172549,0.031373,0.043137,0.131993,...,0.071921,0.020229,0.025833,0.94439,3.499922,0.021072,0.017867,0.015063,0.003185,0.005979
2,80086,15.960288,0.223923,0.200768,3.37242,0.169655,0.156863,0.023529,0.031373,0.173218,...,0.049966,0.017867,0.020963,0.8756,3.206401,0.013669,0.021685,0.012737,0.002807,0.004732
3,80092,19.491276,0.354271,0.221223,3.827091,0.183165,0.211765,0.027451,0.031373,0.163543,...,0.070411,0.017813,0.01683,0.907405,3.726454,0.018573,0.01895,0.016912,0.002859,0.004037
4,80099,13.616371,0.205485,0.224691,3.068618,0.1825,0.164706,0.019608,0.039216,0.143585,...,0.083541,0.024922,0.024043,0.927892,3.492507,0.020536,0.021883,0.020422,0.003853,0.005755


In [13]:
# Inner-join the metadata and nuHIF tables on their shared 'H & E_ID' column,
# keeping only the IDs that appear in both tables.
merged_df = PathAI_meta_data.merge(PathAI_nuHIFs_data, on='H & E_ID', how='inner')
merged_df

Unnamed: 0,H & E_ID,gender,vital_status,aneuploidy_score,hrd_score,genome_doublings,os,os_time,pfs,pfs_time,...,STD[PLASMA_NUCLEUS_MIN_HSV_CHANNEL_SATURATION]_H & E,STD[PLASMA_NUCLEUS_MIN_LAB_CHANNEL_A]_H & E,STD[PLASMA_NUCLEUS_MIN_LAB_CHANNEL_B]_H & E,STD[PLASMA_NUCLEUS_ORIENTATION]_H & E,STD[PLASMA_NUCLEUS_PERIMETER]_H & E,STD[PLASMA_NUCLEUS_SOLIDITY]_H & E,STD[PLASMA_NUCLEUS_STD_GRAYSCALE_CHANNEL_GRAY]_H & E,STD[PLASMA_NUCLEUS_STD_HSV_CHANNEL_SATURATION]_H & E,STD[PLASMA_NUCLEUS_STD_LAB_CHANNEL_A]_H & E,STD[PLASMA_NUCLEUS_STD_LAB_CHANNEL_B]_H & E
0,26524,FEMALE,Alive,0.0,0.0,0.0,0,8,0,8,...,0.088273,0.012206,0.017756,1.213632,4.311863,0.010056,0.020215,0.018900,0.002124,0.003534
1,75662,FEMALE,Dead,3.0,10.0,0.0,1,2009,1,2009,...,0.071921,0.020229,0.025833,0.944390,3.499922,0.021072,0.017867,0.015063,0.003185,0.005979
2,80086,FEMALE,Alive,,,,0,446,0,446,...,0.049966,0.017867,0.020963,0.875600,3.206401,0.013669,0.021685,0.012737,0.002807,0.004732
3,80092,FEMALE,Alive,10.0,12.0,0.0,0,433,0,433,...,0.070411,0.017813,0.016830,0.907405,3.726454,0.018573,0.018950,0.016912,0.002859,0.004037
4,80099,FEMALE,Alive,,,,0,385,0,385,...,0.083541,0.024922,0.024043,0.927892,3.492507,0.020536,0.021883,0.020422,0.003853,0.005755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
881,91896,FEMALE,Alive,6.0,9.0,0.0,0,1150,0,1150,...,0.079903,0.018760,0.022747,0.884152,3.281347,0.017079,0.020732,0.018555,0.004168,0.005220
882,91897,FEMALE,Alive,11.0,17.0,1.0,0,317,0,317,...,0.074087,0.023444,0.021649,0.889008,3.279637,0.015747,0.020919,0.019710,0.003739,0.006685
883,91901,FEMALE,Alive,,,,0,849,1,681,...,0.088994,0.018527,0.015926,0.955501,3.587269,0.017242,0.022089,0.017854,0.003783,0.004299
884,91908,FEMALE,Alive,3.0,11.0,0.0,0,172,0,172,...,0.074113,0.016598,0.023222,0.901696,3.542597,0.024372,0.018528,0.014444,0.003381,0.005702


In [14]:
# Column-name prefixes to keep: the MEAN and STD statistics of the
# CANCER, FIBROBLAST and LYMPHOCYTE feature groups.
start_patterns = [
    f'{stat}[{group}'
    for stat in ('MEAN', 'STD')
    for group in ('CANCER', 'FIBROBLAST', 'LYMPHOCYTE')
]

# A column is kept only if its name also contains at least one of these substrings.
required_strings = [
    # shape descriptors
    'AREA', 'MAJOR_AXIS_LENGTH', 'MINOR_AXIS_LENGTH', 'PERIMETER',
    'CIRCULARITY', 'ECCENTRICITY', 'SOLIDITY',
    # colour-channel statistics
    'STD_GRAYSCALE_CHANNEL_GRAY', 'STD_HSV_CHANNEL_SATURATION',
    'STD_LAB_CHANNEL_A', 'STD_LAB_CHANNEL_B',
    'MIN_GRAYSCALE_CHANNEL_GRAY', 'MIN_HSV_CHANNEL_SATURATION',
    'MEAN_LAB_CHANNEL_A', 'MEAN_LAB_CHANNEL_B',
]

# Keep the columns that begin with one of the prefixes AND mention one of the
# required substrings (str.startswith accepts a tuple of alternatives).
cancer_fibroblast_lymphocyte_columns = [
    name
    for name in merged_df.columns
    if name.startswith(tuple(start_patterns))
    and any(token in name for token in required_strings)
]

# The patient barcode is the only identifier carried along with the features;
# it goes first in the selection.
essential_columns = ['bcr_patient_barcode']
final_columns = essential_columns + cancer_fibroblast_lymphocyte_columns

# Select those columns and expose the barcode under the name 'sample'.
extracted_subtype_nuHIFs = (
    merged_df[final_columns]
    .rename(columns={'bcr_patient_barcode': 'sample'})
)

extracted_subtype_nuHIFs


Unnamed: 0,sample,MEAN[CANCER_NUCLEUS_AREA]_H & E,MEAN[CANCER_NUCLEUS_CIRCULARITY]_H & E,MEAN[CANCER_NUCLEUS_ECCENTRICITY]_H & E,MEAN[CANCER_NUCLEUS_MAJOR_AXIS_LENGTH]_H & E,MEAN[CANCER_NUCLEUS_MEAN_LAB_CHANNEL_A]_H & E,MEAN[CANCER_NUCLEUS_MEAN_LAB_CHANNEL_B]_H & E,MEAN[CANCER_NUCLEUS_MINOR_AXIS_LENGTH]_H & E,MEAN[CANCER_NUCLEUS_MIN_GRAYSCALE_CHANNEL_GRAY]_H & E,MEAN[CANCER_NUCLEUS_MIN_HSV_CHANNEL_SATURATION]_H & E,...,STD[LYMPHOCYTE_NUCLEUS_MEAN_LAB_CHANNEL_B]_H & E,STD[LYMPHOCYTE_NUCLEUS_MINOR_AXIS_LENGTH]_H & E,STD[LYMPHOCYTE_NUCLEUS_MIN_GRAYSCALE_CHANNEL_GRAY]_H & E,STD[LYMPHOCYTE_NUCLEUS_MIN_HSV_CHANNEL_SATURATION]_H & E,STD[LYMPHOCYTE_NUCLEUS_PERIMETER]_H & E,STD[LYMPHOCYTE_NUCLEUS_SOLIDITY]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_GRAYSCALE_CHANNEL_GRAY]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_HSV_CHANNEL_SATURATION]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_LAB_CHANNEL_A]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_LAB_CHANNEL_B]_H & E
0,TCGA-PL-A8LY,37.870327,0.752129,0.740944,9.088960,0.594218,0.405748,5.675256,0.135048,0.313856,...,0.009443,0.957306,0.040454,0.089436,3.992348,0.058896,0.024395,0.020367,0.004481,0.005192
1,TCGA-BH-A18S,38.068584,0.803479,0.667058,8.564404,0.610667,0.416995,6.018374,0.130621,0.255800,...,0.010207,0.837812,0.040262,0.074601,3.188401,0.018339,0.020380,0.018566,0.003950,0.007380
2,TCGA-A7-A4SC,32.376812,0.795850,0.679380,7.991061,0.578427,0.431150,5.526413,0.226366,0.133215,...,0.006967,0.841132,0.047287,0.046261,3.218295,0.017572,0.021008,0.015158,0.003017,0.005687
3,TCGA-D8-A1Y2,34.365021,0.788894,0.682284,8.251385,0.602840,0.412379,5.653194,0.151201,0.219145,...,0.007224,0.801812,0.051126,0.065717,3.289017,0.019602,0.020905,0.020543,0.003352,0.004804
4,TCGA-C8-A12V,39.071213,0.784081,0.686660,8.686616,0.600544,0.413815,5.919993,0.170848,0.118866,...,0.017781,0.727386,0.057589,0.072681,3.023650,0.019152,0.021876,0.021078,0.004379,0.006619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
881,TCGA-AR-A1AS,47.793850,0.759587,0.734216,10.062342,0.616858,0.429411,6.332615,0.131279,0.251252,...,0.008048,0.731268,0.040083,0.078726,3.091555,0.018503,0.022948,0.024541,0.003706,0.005923
882,TCGA-C8-A1HL,48.468864,0.763141,0.713281,9.954347,0.607404,0.399618,6.505261,0.147581,0.205403,...,0.013180,0.781547,0.060307,0.072402,3.213238,0.018057,0.022812,0.023739,0.004350,0.006741
883,TCGA-LQ-A4E4,40.482265,0.776762,0.680984,8.898990,0.597539,0.422517,6.147808,0.093983,0.201440,...,0.009824,0.876126,0.033074,0.090117,3.759877,0.025011,0.023226,0.022701,0.003968,0.004428
884,TCGA-AN-A0FF,33.570347,0.766258,0.702174,8.304225,0.594162,0.405842,5.511056,0.127157,0.241588,...,0.007930,0.731670,0.033249,0.076339,3.251431,0.021967,0.022200,0.018067,0.003412,0.007163


In [15]:
# Load the TCGA BRCA clinical subtype table (tab-separated file).
tcga_subtypes_data   = pd.read_table(data_path1 + data_file1, sep = "\t")

# Strip the trailing '-01' from each sampleID so it matches the sample IDs of
# the nuHIF table. The pattern is anchored with '$' so that only a suffix is
# removed; an unanchored literal replace would also delete any '-01' that
# happens to occur earlier inside an identifier.
tcga_subtypes_data['sampleID'] = tcga_subtypes_data['sampleID'].str.replace(r'-01$', '', regex=True)

# Give the biomarker columns the short names used in the rest of the notebook.
tcga_subtypes_data = tcga_subtypes_data.rename(columns={"HER2_Final_Status_nature2012": "HER2_Status", "PR_Status_nature2012": "PR_Status", "ER_Status_nature2012": "ER_Status"})

tcga_subtypes_data

Unnamed: 0,sampleID,HER2_Status,PR_Status,ER_Status,BRCA_Subtypes
0,TCGA-A1-A0SB,Negative,Negative,Positive,HR Positive
1,TCGA-A1-A0SD,Negative,Positive,Positive,HR Positive
2,TCGA-A1-A0SE,Negative,Positive,Positive,HR Positive
3,TCGA-A1-A0SF,Negative,Positive,Positive,HR Positive
4,TCGA-A1-A0SG,Negative,Positive,Positive,HR Positive
...,...,...,...,...,...
739,TCGA-EW-A1PB,Negative,Negative,Negative,TNBC
740,TCGA-EW-A1PD,Positive,Positive,Positive,HER2 Positive
741,TCGA-EW-A1PE,Negative,Positive,Positive,HR Positive
742,TCGA-EW-A1PF,Negative,Positive,Positive,HR Positive


In [16]:
# Inner-join the selected nuHIF features with the TCGA biomarker table:
# 'sample' (left) is matched against 'sampleID' (right), so only samples
# present in both tables are kept.
merged_df1 = extracted_subtype_nuHIFs.merge(
    tcga_subtypes_data,
    how='inner',
    left_on='sample',
    right_on='sampleID',
)

merged_df1

Unnamed: 0,sample,MEAN[CANCER_NUCLEUS_AREA]_H & E,MEAN[CANCER_NUCLEUS_CIRCULARITY]_H & E,MEAN[CANCER_NUCLEUS_ECCENTRICITY]_H & E,MEAN[CANCER_NUCLEUS_MAJOR_AXIS_LENGTH]_H & E,MEAN[CANCER_NUCLEUS_MEAN_LAB_CHANNEL_A]_H & E,MEAN[CANCER_NUCLEUS_MEAN_LAB_CHANNEL_B]_H & E,MEAN[CANCER_NUCLEUS_MINOR_AXIS_LENGTH]_H & E,MEAN[CANCER_NUCLEUS_MIN_GRAYSCALE_CHANNEL_GRAY]_H & E,MEAN[CANCER_NUCLEUS_MIN_HSV_CHANNEL_SATURATION]_H & E,...,STD[LYMPHOCYTE_NUCLEUS_SOLIDITY]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_GRAYSCALE_CHANNEL_GRAY]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_HSV_CHANNEL_SATURATION]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_LAB_CHANNEL_A]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_LAB_CHANNEL_B]_H & E,sampleID,HER2_Status,PR_Status,ER_Status,BRCA_Subtypes
0,TCGA-D8-A1Y2,34.365021,0.788894,0.682284,8.251385,0.602840,0.412379,5.653194,0.151201,0.219145,...,0.019602,0.020905,0.020543,0.003352,0.004804,TCGA-D8-A1Y2,Negative,Positive,Positive,HR Positive
1,TCGA-C8-A12V,39.071213,0.784081,0.686660,8.686616,0.600544,0.413815,5.919993,0.170848,0.118866,...,0.019152,0.021876,0.021078,0.004379,0.006619,TCGA-C8-A12V,Negative,Negative,Negative,TNBC
2,TCGA-EW-A1P6,38.241444,0.810570,0.643006,8.426938,0.606436,0.413195,6.092019,0.189393,0.159895,...,0.016114,0.020894,0.018527,0.004438,0.006021,TCGA-EW-A1P6,Negative,Positive,Positive,HR Positive
3,TCGA-AR-A1AJ,37.569180,0.756555,0.742634,9.057047,0.615502,0.423559,5.576258,0.119670,0.310942,...,0.020363,0.021541,0.025585,0.003586,0.006296,TCGA-AR-A1AJ,Negative,Negative,Positive,HR Positive
4,TCGA-E2-A15S,37.934071,0.813011,0.654944,8.453368,0.610344,0.407051,6.011490,0.222011,0.180906,...,0.012161,0.020599,0.023713,0.003215,0.005068,TCGA-E2-A15S,Negative,Negative,Positive,HR Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577,TCGA-A2-A04N,40.167149,0.786355,0.661289,8.731260,0.652192,0.405773,6.183293,0.197981,0.310632,...,0.016338,0.021173,0.020896,0.004519,0.003546,TCGA-A2-A04N,Negative,Positive,Positive,HR Positive
578,TCGA-AR-A1AS,47.793850,0.759587,0.734216,10.062342,0.616858,0.429411,6.332615,0.131279,0.251252,...,0.018503,0.022948,0.024541,0.003706,0.005923,TCGA-AR-A1AS,Negative,Positive,Positive,HR Positive
579,TCGA-C8-A1HL,48.468864,0.763141,0.713281,9.954347,0.607404,0.399618,6.505261,0.147581,0.205403,...,0.018057,0.022812,0.023739,0.004350,0.006741,TCGA-C8-A1HL,Positive,Negative,Positive,HER2 Positive
580,TCGA-AN-A0FF,33.570347,0.766258,0.702174,8.304225,0.594162,0.405842,5.511056,0.127157,0.241588,...,0.021967,0.022200,0.018067,0.003412,0.007163,TCGA-AN-A0FF,Negative,Positive,Positive,HR Positive


In [17]:
# Biomarker / subtype columns that should come right after the sample identifier
desired_columns = ['HER2_Status', 'PR_Status', 'ER_Status', 'BRCA_Subtypes']

# Drop the redundant join key and rename 'sample' to 'sample_id'.
# errors='ignore' on drop, together with rename (which skips labels that are
# absent), turns both steps into no-ops when this cell is executed a second
# time, so re-running it no longer raises KeyError.
merged_df1 = (
    merged_df1
    .drop(columns=['sampleID'], errors='ignore')
    .rename(columns={'sample': 'sample_id'})
)

# Columns that are neither the identifier nor one of the desired columns keep
# their current relative order.
remaining_columns = [col for col in merged_df1.columns if col not in desired_columns and col != 'sample_id']

# Final layout: 'sample_id', then the desired columns, then everything else
new_column_order = ['sample_id'] + desired_columns + remaining_columns

# Apply the new column order to the DataFrame
merged_df1 = merged_df1[new_column_order]

# Display the resulting DataFrame
merged_df1

Unnamed: 0,sample_id,HER2_Status,PR_Status,ER_Status,BRCA_Subtypes,MEAN[CANCER_NUCLEUS_AREA]_H & E,MEAN[CANCER_NUCLEUS_CIRCULARITY]_H & E,MEAN[CANCER_NUCLEUS_ECCENTRICITY]_H & E,MEAN[CANCER_NUCLEUS_MAJOR_AXIS_LENGTH]_H & E,MEAN[CANCER_NUCLEUS_MEAN_LAB_CHANNEL_A]_H & E,...,STD[LYMPHOCYTE_NUCLEUS_MEAN_LAB_CHANNEL_B]_H & E,STD[LYMPHOCYTE_NUCLEUS_MINOR_AXIS_LENGTH]_H & E,STD[LYMPHOCYTE_NUCLEUS_MIN_GRAYSCALE_CHANNEL_GRAY]_H & E,STD[LYMPHOCYTE_NUCLEUS_MIN_HSV_CHANNEL_SATURATION]_H & E,STD[LYMPHOCYTE_NUCLEUS_PERIMETER]_H & E,STD[LYMPHOCYTE_NUCLEUS_SOLIDITY]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_GRAYSCALE_CHANNEL_GRAY]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_HSV_CHANNEL_SATURATION]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_LAB_CHANNEL_A]_H & E,STD[LYMPHOCYTE_NUCLEUS_STD_LAB_CHANNEL_B]_H & E
0,TCGA-D8-A1Y2,Negative,Positive,Positive,HR Positive,34.365021,0.788894,0.682284,8.251385,0.602840,...,0.007224,0.801812,0.051126,0.065717,3.289017,0.019602,0.020905,0.020543,0.003352,0.004804
1,TCGA-C8-A12V,Negative,Negative,Negative,TNBC,39.071213,0.784081,0.686660,8.686616,0.600544,...,0.017781,0.727386,0.057589,0.072681,3.023650,0.019152,0.021876,0.021078,0.004379,0.006619
2,TCGA-EW-A1P6,Negative,Positive,Positive,HR Positive,38.241444,0.810570,0.643006,8.426938,0.606436,...,0.008084,0.759774,0.039343,0.064644,2.844278,0.016114,0.020894,0.018527,0.004438,0.006021
3,TCGA-AR-A1AJ,Negative,Negative,Positive,HR Positive,37.569180,0.756555,0.742634,9.057047,0.615502,...,0.007268,0.646987,0.033704,0.076382,2.803650,0.020363,0.021541,0.025585,0.003586,0.006296
4,TCGA-E2-A15S,Negative,Negative,Positive,HR Positive,37.934071,0.813011,0.654944,8.453368,0.610344,...,0.007589,0.618312,0.048938,0.055562,2.364664,0.012161,0.020599,0.023713,0.003215,0.005068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577,TCGA-A2-A04N,Negative,Positive,Positive,HR Positive,40.167149,0.786355,0.661289,8.731260,0.652192,...,0.006646,0.706071,0.044662,0.079656,2.822683,0.016338,0.021173,0.020896,0.004519,0.003546
578,TCGA-AR-A1AS,Negative,Positive,Positive,HR Positive,47.793850,0.759587,0.734216,10.062342,0.616858,...,0.008048,0.731268,0.040083,0.078726,3.091555,0.018503,0.022948,0.024541,0.003706,0.005923
579,TCGA-C8-A1HL,Positive,Negative,Positive,HER2 Positive,48.468864,0.763141,0.713281,9.954347,0.607404,...,0.013180,0.781547,0.060307,0.072402,3.213238,0.018057,0.022812,0.023739,0.004350,0.006741
580,TCGA-AN-A0FF,Negative,Positive,Positive,HR Positive,33.570347,0.766258,0.702174,8.304225,0.594162,...,0.007930,0.731670,0.033249,0.076339,3.251431,0.021967,0.022200,0.018067,0.003412,0.007163


In [18]:
# Destination of the mapped table inside the outputs directory
tcga_brca_subtype_to_original_nuhifs = outputs_path + "tcga_brca_subtype_to_original_nuhifs.csv"

# Persist the merged table as CSV; the positional row index is not written
merged_df1.to_csv(
    tcga_brca_subtype_to_original_nuhifs,
    index=False,
)

# Report where the file went
print("The files have been mapped and saved to:", tcga_brca_subtype_to_original_nuhifs)
print("Done!")

The files have been mapped and saved to: PA_NUHIF_BRCA/outputs_biomarker_status/tcga_brca_subtype_to_original_nuhifs.csv
Done!
