## Preprocess the clinical dataset

In [1]:
import pandas as pd
import numpy as np

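The clinical data can be obtained by viewing your cart on the GDC Data Portal and selecting the "Clinical" download icon. The downloaded folder contains three .tsv files, one of which is "clinical.tsv". The function below reads that file and builds the processed clinical feature table; the cell after it defines the path to our clinical data and where to store our processed dataset, then runs the function and saves the result.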

In [None]:
def create_clinical_csv(data_path, cv_threshold=30, project_labels=None):
    """Build a one-hot encoded clinical feature table from a GDC "clinical.tsv" file.

    Despite the name, nothing is written to disk here: the processed table is
    returned and the caller decides where to save it.

    Steps:
      1. Read the tab-separated file and keep the last row per ``case_submitter_id``.
      2. Derive the target ``y`` from ``project_id`` through ``project_labels``.
      3. Treat the placeholder ``'--`` as missing and drop every column that
         contains a missing value.
      4. Drop the columns that directly reveal the organ (label leakage).
      5. For each remaining text column, compute the coefficient of variation
         (std / mean * 100) of its value counts and keep the columns whose CV
         exceeds ``cv_threshold``.
      6. One-hot encode the kept columns (first level dropped) and attach
         ``case_id`` and ``y``.

    Parameters
    ----------
    data_path : str, path-like or file-like
        Anything ``pd.read_csv`` accepts; the content must be tab-separated.
    cv_threshold : float, default 30
        Minimum coefficient of variation (in percent) a text column needs in
        order to be encoded.
    project_labels : dict, optional
        Mapping from ``project_id`` to target label. Defaults to the five TCGA
        projects liver / stomach / colon / kidney / lung.

    Returns
    -------
    pandas.DataFrame
        One row per case: the dummy columns, then ``case_id`` (taken from
        ``case_submitter_id``) and ``y``.

    Raises
    ------
    KeyError
        If ``project_id`` contains a value with no entry in ``project_labels``.
        Without this check the ``y`` column would contain missing values and be
        removed together with the other incomplete columns.
    """
    if project_labels is None:
        project_labels = {
            "TCGA-LIHC": "liver",
            "TCGA-STAD": "stomach",
            "TCGA-COAD": "colon",
            "TCGA-KIRC": "kidney",
            "TCGA-LUAD": "lung",
        }
    id_column = "case_submitter_id"
    target_column = "y"

    clinical_data_raw = pd.read_csv(data_path, sep="\t")
    clinical_data_raw = clinical_data_raw.drop_duplicates(subset=id_column, keep="last")

    # Create the target variable and fail early, with a readable message, when a
    # project has no label.
    labels = clinical_data_raw["project_id"].map(project_labels)
    unmapped = clinical_data_raw.loc[labels.isnull(), "project_id"].unique().tolist()
    if unmapped:
        raise KeyError(f"project_id values with no label mapping: {unmapped}")
    # assign() returns a new frame, so the de-duplicated frame is not modified in place.
    clinical_data_raw = clinical_data_raw.assign(**{target_column: labels})

    # Turn the "'--" placeholder into NaN, then keep only fully populated columns.
    clinical_data_nas = clinical_data_raw.mask(clinical_data_raw == "'--")
    has_missing = clinical_data_nas.isnull().any()
    clinical_data_no_nas = clinical_data_nas.loc[:, ~has_missing].reset_index(drop=True)

    # IMPORTANT - when combining multiple TCGA projects (kidney, lung, stomach, etc.),
    # some columns name the organ and would give away the prediction task.
    # errors="ignore" because such a column may already have been removed above
    # (it contained a missing value) or may not exist in this export.
    leakage_columns = ["tissue_or_organ_of_origin", "site_of_resection_or_biopsy"]
    clinical_data_no_nas = clinical_data_no_nas.drop(columns=leakage_columns, errors="ignore")

    # Several features have very little variation (same label across most cases).
    # The coefficient of variation of the value counts is used to pick the more
    # informative ones. The identifier, the project id and the target are never
    # candidates: encoding them would leak the label into the features.
    never_encoded = {id_column, "project_id", target_column}
    cvs = {}
    for column in clinical_data_no_nas.columns:
        if column in never_encoded:
            continue
        values = clinical_data_no_nas[column]
        # Text columns may be stored as object or as pandas' dedicated string
        # dtype; the latter does not compare equal to "object".
        if not (pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values)):
            continue
        frequencies = values.value_counts()
        # A column with a single level has an undefined (NaN) std and is never selected.
        cvs[column] = (frequencies.std() / frequencies.mean()) * 100
    # Optional: bar-plot `cvs` (e.g. with matplotlib's barh) to inspect the distribution.

    categorical = [column for column, cv in cvs.items() if cv > cv_threshold]
    print(categorical)

    dummy_frames = [
        pd.get_dummies(clinical_data_no_nas[column], prefix=column, drop_first=True)
        for column in categorical
    ]
    if dummy_frames:
        categorical_dummies = pd.concat(dummy_frames, axis=1)
    else:
        # pd.concat refuses an empty list; keep the row index so the id and
        # target columns can still be attached.
        categorical_dummies = pd.DataFrame(index=clinical_data_no_nas.index)

    c_data = categorical_dummies.assign(
        case_id=clinical_data_no_nas[id_column],
        y=clinical_data_no_nas[target_column],
    )
    return c_data



In [2]:
# Input: the raw "clinical.tsv" file; output: where the processed table is saved.
CLINICAL_DATA_PATH = ".../TCGA/clinical.tsv"
DESTINATION_DATA_PATH = ".../TCGA/data_processed/PRCSD_clinical_data.csv"

# Build the processed clinical table, then write it out without the row index.
clinical_data = create_clinical_csv(data_path=CLINICAL_DATA_PATH)
clinical_data.to_csv(path_or_buf=DESTINATION_DATA_PATH, index=False)