In [1]:
import pandas as pd
import numpy as np
import glob
import os
from datetime import datetime
import json

## Collect the conversion relation between bcr_patient_uuid and bcr_sample_barcode

In [2]:
uuid2barcode = {} # store the conversion in dict
path = "../data/biomed_clinic_data/01_csv_data"
all_files = glob.glob(os.path.join(path, "*.csv"))
for f in all_files:
    df = pd.read_csv(f, sep=",")
    if "bcr_patient_uuid" in df and "bcr_patient_barcode" in df:
        for i in range(2, df.shape[0]):
            if df["bcr_patient_uuid"][i] not in uuid2barcode:
                uuid2barcode[df["bcr_patient_uuid"][i]] = (
                    df["bcr_patient_barcode"][i] + "-01"
                )

In [3]:
# store the conversion if wanted
with open("uuid2barcode.json", "w") as file:
    json.dump(uuid2barcode, file)

## Merge the original multi-omics and biomed-clinical data using bcr_sample_barcode as index

In [4]:
def merge_omics_biomed(omic_df, biomed_df, uuid2barcode, merged_name):
    (num_patients, num_features) = omic_df.shape
    print(
        f"Number of features: {num_features} \t Number of patients: {num_patients} in omics data"
    )

    (num_patients, num_features) = biomed_df.shape
    print(
        f"Number of features: {num_features} \t Number of patients: {num_patients} in biomed data"
    )

    # change uuid to barcode
    biomed_df["bcr_patient_barcode"] = "TEST"
    for i in range(biomed_df.shape[0]):
        if biomed_df["bcr_patient_uuid"][i] in uuid2barcode:
            biomed_df["bcr_patient_barcode"][i] = uuid2barcode[
                biomed_df["bcr_patient_uuid"][i]
            ]
        else:
            biomed_df["bcr_patient_barcode"][i] = None
    biomed_df.drop(columns=["bcr_patient_uuid"], inplace=True)

    # Temporary only take the "RECURRENCE" data
    target_df = biomed_df
    target_df = target_df.drop_duplicates()
    target_df = target_df.set_index("bcr_patient_barcode")

    (num_patients, num_features) = target_df.shape
    print(
        f"Number of features: {num_features} \t Number of patients: {num_patients} in omics data"
    )

    (num_patients, num_features) = omic_df.shape
    print(
        f"Number of features: {num_features} \t Number of patients: {num_patients} in biomed data"
    )
    
    merged_df = pd.merge(target_df, omic_df, left_index=True, right_index=True)
    
    # move "RECURRENCE" to the last column
    column_to_reorder = merged_df.pop("RECURRENCE")
    merged_df.insert(len(merged_df.columns), "RECURRENCE", column_to_reorder)

    (num_patients, num_features) = merged_df.shape
    print(
        f"Number of features: {num_features} \t Number of patients: {num_patients} in biomed data"
    )
    
    merged_df.to_csv("./original_omics_biomed_clinical_data/{}".format(merged_name))

    return target_df, omic_df, merged_df

In [5]:
# Load multi-omics data into dataframe
omics = "cnv_methyl_mrna"
omic_df = pd.read_csv(
        "../data/omics_data/1_csv_data/{}.csv".format(omics), index_col=0
    ).T
omic_df = omic_df.astype("float32")

# Load biomed-clinical data into dataframe
biomed = "biomed_clinical_85features_w_barcodes"
biomed_df = (
    pd.read_csv("../data/biomed_clinic_data/02_combined_data/{}.csv".format(biomed), index_col=0,)
    .drop_duplicates()
    .drop(columns=["bcr_sample_barcode"])
    .reset_index()
)

# Load the conversion between uuid and barcode
with open("uuid2barcode.json", "r") as file:
    uuid2barcode = json.load(file)
    
# merge multi-omics and biomed-clinical data
merged_name = "{}_biomed_clinical_85_features.csv".format(omics)
biomed_df, omic_df, merged_df = merge_omics_biomed(omic_df, biomed_df, uuid2barcode, merged_name)

Number of features: 58512 	 Number of patients: 555 in omics data
Number of features: 85 	 Number of patients: 956 in biomed data
Number of features: 84 	 Number of patients: 467 in omics data
Number of features: 58512 	 Number of patients: 555 in biomed data
Number of features: 58596 	 Number of patients: 450 in biomed data


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [6]:
biomed_df.head()

Unnamed: 0_level_0,concentration,normal_tumor_genotype_match,pcr_amplification_successful,intermediate_dimension,is_ffpe,longest_dimension,sample_type_id,shortest_dimension,percent_necrosis,percent_normal_cells,...,Stage IA,Stage IB,Stage IC,Stage IIA,Stage IIB,Stage IIC,Stage IIIA,Stage IIIB,Stage IIIC,Stage IV
bcr_patient_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-10-0926-01,0.15,1,1.0,0.8,0,1.0,1.0,0.6,0,2.0,...,0,0,0,0,0,0,0,0,1,0
TCGA-10-0927-01,0.14,1,1.0,0.7,0,1.1,1.0,0.6,0,5.0,...,0,0,0,0,0,0,0,0,1,0
TCGA-10-0930-01,0.1,1,0.317005,0.8,0,0.8,1.0,0.3,0,0.0,...,0,0,0,0,0,0,0,0,1,0
TCGA-10-0931-01,0.14,1,1.0,0.5,0,0.5,1.0,0.5,0,0.0,...,0,0,0,0,0,0,0,0,1,0
TCGA-10-0933-01,0.15,1,1.0,0.4,0,1.0,1.0,0.4,0,5.0,...,0,0,0,0,0,0,0,0,1,0


In [7]:
omic_df.head()

Unnamed: 0,ACAP3,ACTRT2,AGRN,ANKRD65,ATAD3A,ATAD3B,ATAD3C,AURKAIP1,B3GALT6,C1orf159,...,SELE,SLC7A10,SLC7A11,FOLR1,NPY5R,GNGT1,PTRF,SERPINH1,NFIX,SELP
TCGA-04-1331-01,-0.703,-0.703,-0.703,-0.703,-0.703,-0.703,-0.703,-0.703,-0.703,-0.703,...,3.115546,3.489387,4.690337,8.928609,3.188657,3.541694,5.789259,7.104112,4.341545,3.584882
TCGA-04-1332-01,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,...,3.325636,3.136609,4.881767,6.034238,3.623994,3.517472,7.057333,7.898462,3.953707,3.884114
TCGA-04-1335-01,-0.807,-0.807,-0.807,-0.807,-0.807,-0.807,-0.807,-0.807,-0.807,-0.807,...,3.15942,3.599663,4.846113,8.604194,3.314188,4.060605,5.037189,7.150663,4.53003,3.169389
TCGA-04-1336-01,0.101,0.101,0.101,0.101,0.101,0.101,0.101,0.101,0.101,0.101,...,2.773288,3.359266,3.977673,9.625358,3.379569,4.004407,5.565058,7.214044,4.347416,3.58165
TCGA-04-1337-01,0.021,0.021,0.021,0.021,0.021,0.021,0.021,0.021,0.021,0.021,...,3.041457,3.25286,5.032966,10.025802,3.089731,3.378608,6.881126,7.998164,3.840921,3.795352


In [8]:
merged_df.head()

Unnamed: 0,concentration,normal_tumor_genotype_match,pcr_amplification_successful,intermediate_dimension,is_ffpe,longest_dimension,sample_type_id,shortest_dimension,percent_necrosis,percent_normal_cells,...,SLC7A10,SLC7A11,FOLR1,NPY5R,GNGT1,PTRF,SERPINH1,NFIX,SELP,RECURRENCE
TCGA-10-0926-01,0.15,1,1.0,0.8,0,1.0,1.0,0.6,0,2.0,...,3.318714,5.28057,8.460801,3.159805,4.456984,6.21104,8.374049,4.083338,3.10046,1
TCGA-10-0927-01,0.14,1,1.0,0.7,0,1.1,1.0,0.6,0,5.0,...,3.483028,4.749874,10.052485,3.116182,4.423287,6.843497,7.538854,5.303522,3.264206,0
TCGA-10-0930-01,0.1,1,0.317005,0.8,0,0.8,1.0,0.3,0,0.0,...,3.450409,6.067577,7.224591,3.471561,3.369886,7.21528,7.692144,3.545209,3.187816,0
TCGA-10-0931-01,0.14,1,1.0,0.5,0,0.5,1.0,0.5,0,0.0,...,3.338876,4.878944,8.498856,3.29276,3.245272,6.191794,7.628787,3.836125,3.393124,1
TCGA-10-0933-01,0.15,1,1.0,0.4,0,1.0,1.0,0.4,0,5.0,...,4.220882,3.867453,5.216125,3.407106,3.89327,6.710351,8.377097,3.754998,3.598371,0


## Merge the encoded omics data with the biomed-clinical data using bcr_patient_barcode as index

In [None]:
# Load multi-omics data into dataframe
omics = "cnv_methyl_mrna"
omic_df = pd.read_csv(
        "../data/omics_data/1_csv_data/{}.csv".format(omics), index_col=0
    ).T
omic_df = omic_df.astype("float32")

# Load biomed-clinical data into dataframe
biomed = "85_features_w_barcodes"
biomed_df = (
    pd.read_csv("../data/biomed_clinic_data/02_combined_data/{}.csv".format(biomed), index_col=0,)
    .drop_duplicates()
    .drop(columns=["bcr_sample_barcode"])
    .reset_index()
)

# Load the conversion between uuid and barcode
with open("uuid2barcode.json", "r") as file:
    uuid2barcode = json.load(file)
    
# merge multi-omics and biomed-clinical data
merged_name = "{}_biomed_clinical_85_features.csv".format(omics)
merged_df = merge_omics_biomed(omic_df, biomed_df, uuid2barcode, merged_name)