## Find the intersection of patients across the five modalities and create a val-test-train split

In [1]:
import glob
import os

import numpy as np
import pandas as pd

In [None]:
# Folder the per-modality CSVs and the images/ sub-folder are read from.
PRCD_DATA_PATH = ".../TCGA/data_processed/"
# Folder the combined id list, the split table and the per-split CSVs are written to.
COMBINED_DATA_PATH = ".../TCGA/combined/"

In [37]:
# Load the four tabular modalities from the processed-data folder.  Every
# file follows the "PRCSD_<modality>_data.csv" naming pattern, and the files
# are read in the same order as the names on the left-hand side.
trans, cnv, clinical, epi = (
    pd.read_csv(f"{PRCD_DATA_PATH}PRCSD_{modality_name}_data.csv")
    for modality_name in ("transcriptomic", "cnv", "clinical", "epigenomic")
)


In [None]:

# Build a one-column frame (column label 0) holding the file-name stem of
# every .jpg in the images folder; these stems are later intersected with
# the "case_id" columns of the tabular modalities.
image_list = glob.glob(PRCD_DATA_PATH + "images/*.jpg")
if not image_list:
    # Without this guard an empty folder surfaces as an opaque `KeyError: 0`
    # when column 0 of an empty frame is accessed.
    raise FileNotFoundError(
        f"No .jpg images found under {PRCD_DATA_PATH}images/"
    )

# os.path handles whichever path separator glob returns on this platform and
# strips the extension without relying on it being exactly four characters.
image_ids = [os.path.splitext(os.path.basename(path))[0] for path in image_list]
images = pd.DataFrame(image_ids)

In [42]:
# Case ids present in every modality: the four tabular sources contribute
# their "case_id" column, the images contribute their file-name stems.
id_sets = [
    set(frame["case_id"].values) for frame in (clinical, trans, epi, cnv)
]
id_sets.append(set(images[0].values))
ids = list(set.intersection(*id_sets))

In [None]:
# Keep only the clinical rows whose case id is shared by all modalities,
# reduced to the id and the "y" column.
in_all_modalities = clinical["case_id"].isin(ids)
comb = clinical.loc[in_all_modalities, ["case_id", "y"]]

In [47]:
# Persist the shared case ids together with their "y" value (row index omitted).
combined_ids_file = COMBINED_DATA_PATH + "combined_ids.csv"
comb.to_csv(combined_ids_file, index=False)

Unnamed: 0,case_id,y
0,TCGA-55-A4DG,lung
1,TCGA-55-A492,lung
2,TCGA-75-7025,lung
3,TCGA-69-A59K,lung
0,TCGA-BP-4804,kidney
1,TCGA-BP-5202,kidney
2,TCGA-B8-5550,kidney
3,TCGA-BP-4986,kidney
4,TCGA-AS-3777,kidney
5,TCGA-A3-3317,kidney


In [None]:
# Number of cases in each held-out split: 10% of the shared cases, rounded
# with Python's built-in round().
ten_percent = round(0.1 * comb.shape[0])

In [80]:
# Draw the test split: `ten_percent` cases sampled without replacement.
# A fixed random_state makes the draw repeatable, so re-running the notebook
# on the same data yields the same test set.
test = comb.sample(n=ten_percent, random_state=42)

In [None]:
# Draw the validation split from the cases not already assigned to test.
# A fixed random_state makes the draw repeatable across notebook runs.
not_in_test = ~comb["case_id"].isin(test["case_id"].values)
val = comb[not_in_test].sample(n=ten_percent, random_state=42)
# Display how the "y" values are distributed within the validation split.
val["y"].value_counts()

In [84]:
# Case ids already assigned to a held-out split (test first, then val).
cases_taken = [*test["case_id"].values, *val["case_id"].values]

In [None]:
# Everything not held out becomes the training split; sample(frac=1) shuffles
# the rows.  A fixed random_state makes the shuffle repeatable across runs.
not_held_out = ~comb["case_id"].isin(cases_taken)
train = comb[not_held_out].sample(frac=1, random_state=42)
# Display how the "y" values are distributed within the training split.
train["y"].value_counts()

In [86]:
# Tag every row of each subset with the name of the split it belongs to.
for subset, label in ((val, "val"), (test, "test"), (train, "train")):
    subset["split"] = label

In [98]:
# Stack the labelled subsets into one table (val, then test, then train),
# keeping each row's original index.  DataFrame.append was removed in
# pandas 2.0; pd.concat with its defaults produces the same result.
splits = pd.concat([val, test, train])

In [89]:
# Persist the case-id -> split assignment (row index omitted).
splits_file = COMBINED_DATA_PATH + "splits.csv"
splits.to_csv(splits_file, index=False)

In [74]:

# Output folder for the per-modality split files, and the saved split table.
save_path = f"{COMBINED_DATA_PATH}split_data/"
id_path = f"{COMBINED_DATA_PATH}splits.csv"

# Re-read the split table from disk and pull out two parallel lists:
# the case ids and the split label recorded for each of them.
id_df = pd.read_csv(id_path)
ids, categories = (id_df[column].tolist() for column in ("case_id", "split"))

In [86]:
# For every tabular modality, write three CSVs (<modality>_train.csv,
# <modality>_test.csv, <modality>_val.csv) under `save_path`, each holding
# the modality's rows for the case ids assigned to that split.
modalities = {"transcriptomic": trans, "cnv": cnv, "epigenomic": epi, "clinical": clinical}
id_name = "case_id"

# Pair each case id with its split label, in the order they were read from
# the split table, then derive one id list per split.
assignment = pd.DataFrame({id_name: ids, "split": categories})
in_train = assignment["split"] == "train"
in_test = assignment["split"] == "test"
split_ids = {
    "train": assignment.loc[in_train, [id_name]],
    "test": assignment.loc[in_test, [id_name]],
    # Any label other than "train"/"test" is routed to validation.
    "val": assignment.loc[~(in_train | in_test), [id_name]],
}

for modality, data in modalities.items():
    frames = {}
    for split_name, wanted in split_ids.items():
        # One inner merge per split replaces row-by-row DataFrame.append
        # (removed in pandas 2.0, and quadratic because every call copied
        # the growing frame).  The merge keeps the order of the ids on the
        # left, drops ids that have no row in this modality, and returns
        # every matching row when an id occurs more than once.
        subset = wanted.merge(data, on=id_name, how="inner")
        # Restore the modality's own column order (merge puts the key first).
        frames[split_name] = subset[list(data.columns)]

    # Keep the train/test/val names bound to the latest modality's frames.
    train, test, val = frames["train"], frames["test"], frames["val"]

    # Save new dataframes as csvs
    train.to_csv(save_path + modality + "_train.csv", index=False)
    test.to_csv(save_path + modality + "_test.csv", index=False)
    val.to_csv(save_path + modality + "_val.csv", index=False)
