## In this notebook, we perform feature reduction on the gene expression (transcriptomic), CNV, and DNA methylation (epigenomic) datasets. We then convert all five modalities into PyTorch tensors.

In [1]:
import sys
sys.path.append('../..')
import pandas as pd
import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from PIL import Image
from torchvision import transforms
from common_files.custom_sets import TCGA_TabDataset, TCGA_ImgDataset

In [None]:
# Root folders: the combined TCGA data and the processed images.
COMBINED_DATA_PATH = ".../TCGA/combined/"
image_path = ".../TCGA/data_processed/images/"

# Locations derived from the combined-data root.
access_path = f"{COMBINED_DATA_PATH}split_data/"    # per-modality train/test/val CSVs are read from here
save_path = f"{access_path}reduced/"                # feature-reduced CSVs are written here
tensor_path = f"{COMBINED_DATA_PATH}tensor_data/"   # saved .pt files are written here
id_path = f"{COMBINED_DATA_PATH}splits.csv"

# Split table; later cells read its "case_id", "split" and "y" columns.
id_order = pd.read_csv(id_path)

In [None]:
# The transcriptomic, CNV, and epigenomic datasets are large, so a random forest is used
# as a feature-reduction method. The forest is fit on the train data only; the features it
# selects are then kept in the train, test, and val tables alike.
import os

modalities = ["transcriptomic", "cnv", "epigenomic"]

# Training labels taken from the split table.
# NOTE(review): this assumes the rows of every "<modality>_train.csv" are in the same order
# as the "train" rows of id_order; nothing here verifies that -- confirm upstream.
y_train = id_order[id_order["split"] == "train"]["y"]

# Create the output folder up front so the to_csv calls below cannot fail on a missing
# directory after the models have already been fit.
os.makedirs(save_path, exist_ok=True)

for modality in modalities:
    train = pd.read_csv(access_path + modality + "_train.csv")
    test = pd.read_csv(access_path + modality + "_test.csv")
    val = pd.read_csv(access_path + modality + "_val.csv")

    case = "case_id"

    # The identifier column is not a feature, so it is excluded from the fit.
    X_train = train.drop(columns=[case])
    for est in [50, 100, 150]:
        # NOTE(review): no random_state is passed, so the selected features may differ
        # from run to run.
        sel = SelectFromModel(RandomForestClassifier(n_estimators=est))
        sel.fit(X_train, y_train)
        selected_feat = X_train.columns[sel.get_support()]

        # Take the id column plus every selected feature in a single selection instead of
        # inserting the columns one by one, which is slow and fragments the frame when
        # many features are kept.
        keep = [case, *selected_feat]
        new_train = train[keep]
        new_test = test[keep]
        new_val = val[keep]

        # Column counts (id column included) of the three reduced tables.
        print(len(new_train.columns))
        print(len(new_test.columns))
        print(len(new_val.columns))

        new_train.to_csv(save_path + modality + "_" + str(est) + "_train.csv", index=False)
        new_test.to_csv(save_path + modality + "_" + str(est) + "_test.csv", index=False)
        new_val.to_csv(save_path + modality + "_" + str(est) + "_val.csv", index=False)

In [53]:
# Tabular modalities to convert into saved tensor datasets. Each modality is listed exactly
# once: a repeated entry would only rebuild and overwrite the same output files.
modalities_to_convert = ["transcriptomic", "cnv", "epigenomic", "clinical"]  # no need to write images
# Data splits to convert for every modality.
sets = ["train", "test", "val"]
# Values looped over for the non-clinical modalities (they appear in the output file names).
est = [50, 100, 150]

In [None]:
# Build a TCGA_TabDataset for every (modality, split) pair and save it under tensor_path.
# "clinical" gets a single dataset per split (third constructor argument 0, no number in
# the file name); every other modality gets one dataset per value in `est`
# (presumably the estimator count used in the reduced CSV names -- confirm against
# TCGA_TabDataset).
for m in modalities_to_convert:
    for s in sets:
        if m == "clinical":
            dataset = TCGA_TabDataset(m, s, 0, COMBINED_DATA_PATH)
            torch.save(dataset, f"{tensor_path}{m}_{s}_inputs.pt")
            continue

        for e in est:
            dataset = TCGA_TabDataset(m, s, e, COMBINED_DATA_PATH)
            torch.save(dataset, f"{tensor_path}{m}_{e}_{s}_inputs.pt")

In [146]:
# Replace the string labels in id_order["y"] with integer class codes.
# NOTE(review): "lung" and "stomach" both map to 1 and no label maps to 4 -- confirm
# that merging these two classes is intended.
# NOTE: a label that is not a key of the mapping becomes NaN, which makes astype(int)
# raise; for the same reason this cell cannot be run twice on the same id_order.
id_order["y"] = id_order["y"].map(
    {"lung": 1, "kidney": 3, "liver": 2, "stomach": 1, "colon": 0}
).astype(int)

# Plain-list copies of the id, split, and encoded-label columns.
ids = id_order["case_id"].to_list()
cat = id_order["split"].to_list()
c_type = id_order["y"].to_list()

In [148]:

# Image file path for every case: image_path, then the case id, then the ".jpg" suffix.
id_order["path"] = id_order["case_id"].radd(image_path).add(".jpg")

In [154]:
def save_data(data, split):
    """Save ``data`` with ``torch.save`` as the image input file of one split.

    The file is written to ``tensor_path`` followed by ``image_<split>_inputs.pt``.

    Args:
        data: Object handed to ``torch.save`` unchanged.
        split: Split name (a string) inserted into the file name.
    """
    file_name = "_".join(["image", split, "inputs.pt"])
    torch.save(data, tensor_path + file_name)

In [156]:
# Transform pipeline applied to every image, built once and shared by all splits:
# convert to a tensor, normalize each of the three channels with mean 0.5 and std 0.5,
# then resize to (120, 160).
image_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    transforms.Resize((120, 160)),
])

# Wrap the rows of each split in a TCGA_ImgDataset and save it through save_data.
for split in ["train", "test", "val"]:
    # Rows of this split only, re-indexed from 0.
    df = id_order.loc[id_order["split"] == split].reset_index(drop=True)
    img_input = TCGA_ImgDataset(data_frame=df, transform=image_transform)
    save_data(img_input, split)