In [1]:
import io
import os
import re

import numpy as np
import pandas as pd
import requests

# Disable pandas' display truncation so that every row and every column of a
# displayed DataFrame is rendered.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [2]:
# Input file: "Clinical with Follow-up" (clinical_PANCAN_patient_with_followup.tsv),
# to be downloaded from https://gdc.cancer.gov/about-data/publications/pancanatlas
clinical_path = "clinical_PANCAN_patient_with_followup.tsv"

# dtype=object: no numeric parsing is done at this stage.
pancan_data = pd.read_table(
    clinical_path,
    sep='\t',
    na_values='',
    dtype=object,
    encoding="cp1252",
)
print('Shape of input file %s' % (str(pancan_data.shape)))

# Spot check: rows for which both the acronym and the colon polyp history are filled in
pancan_data.loc[:, ['acronym', 'history_of_colon_polyps']].dropna(axis=0)


Shape of input file (10956, 746)


Unnamed: 0,acronym,history_of_colon_polyps
1947,COAD,YES
1948,COAD,NO
1949,COAD,NO
1950,COAD,NO
1951,COAD,NO
1952,COAD,NO
1953,COAD,YES
1954,COAD,YES
1955,COAD,YES
1956,COAD,NO


In [30]:
# Show the names of all columns in the combined clinical table
list(pancan_data.columns)

['bcr_patient_uuid',
 'bcr_patient_barcode',
 'acronym',
 'gender',
 'vital_status',
 'days_to_birth',
 'days_to_death',
 'days_to_last_followup',
 'days_to_initial_pathologic_diagnosis',
 'age_at_initial_pathologic_diagnosis',
 'icd_10',
 'tissue_retrospective_collection_indicator',
 'icd_o_3_histology',
 'tissue_prospective_collection_indicator',
 'history_of_neoadjuvant_treatment',
 'icd_o_3_site',
 'tumor_tissue_site',
 'new_tumor_event_after_initial_treatment',
 'radiation_therapy',
 'race',
 'project_code',
 'prior_dx',
 'disease_code',
 'ethnicity',
 'informed_consent_verified',
 'person_neoplasm_cancer_status',
 'patient_id',
 'year_of_initial_pathologic_diagnosis',
 'histological_type',
 'tissue_source_site',
 'form_completion_date',
 'pathologic_T',
 'pathologic_M',
 'clinical_M',
 'pathologic_N',
 'system_version',
 'pathologic_stage',
 'stage_other',
 'clinical_stage',
 'clinical_T',
 'clinical_N',
 'extranodal_involvement',
 'postoperative_rx_tx',
 'primary_therapy_outcome

In [29]:
# Placeholder strings that are treated as missing values during type inference.
NA_PLACEHOLDERS = ['[Not Available]', '[Not Applicable]']

# Source column name -> output column name.
COLUMN_REMAPPING = {'bcr_patient_uuid': 'PATIENT_ID',
                    'vital_status': 'OS_STATUS'}

# Source value -> output value for the OS_STATUS column.
OS_STATUS_REMAPPING = {'Dead': 'DECEASED', 'Alive': 'LIVING'}


def _infer_attribute_types(study_data):
    """Return "NUMBER" or "STRING" for every column of ``study_data``.

    The table was read with ``dtype=object``, so all values are strings. To
    find out which columns are numeric, the NA placeholders are mapped to a
    real missing value and the table is round-tripped through tab-separated
    text so that pandas infers the dtypes. The round trip uses an in-memory
    buffer, so no temporary file is left behind in the working directory.

    A column without any value after the NA remapping is reported as "STRING".
    """
    buffer = io.StringIO()
    study_data.replace(NA_PLACEHOLDERS, np.nan).to_csv(
        buffer, sep='\t', header=True, index=False)
    buffer.seek(0)
    inferred = pd.read_table(buffer)

    type_values = []
    for attribute in study_data.columns:
        column = inferred[attribute]
        is_number = (not column.isnull().all()
                     and np.issubdtype(column.dtype, np.number))
        type_values.append("NUMBER" if is_number else "STRING")
    return type_values


def _build_header(study_data, type_values):
    """Build the four header rows written above the data table.

    Rows, in order: column name, column name again, data type and priority
    (always 1). The first cell of every row is prefixed with '#'.
    """
    names = list(study_data.columns)
    header = pd.DataFrame(
        [names, list(names), list(type_values), [1] * len(names)],
        columns=study_data.columns,
    )
    header.iloc[:, 0] = '#' + header.iloc[:, 0].astype(str)
    return header


def _normalize_column_names(columns, remapping):
    """Return the output names for ``columns``, guaranteed to be unique.

    Each name is remapped through ``remapping`` (if listed), every run of
    characters other than letters, digits and '_' becomes a single '_',
    leading/trailing '_' are stripped and the result is upper-cased.

    Normalisation can make two different source columns collide: the source
    table contains both 'bcr_patient_uuid' (remapped to 'PATIENT_ID') and
    'patient_id' (upper-cased to 'PATIENT_ID'). Explicitly remapped columns
    keep their target name; any other column that collides gets a numeric
    suffix ('_2', '_3', ...) so the output never has duplicate column names.
    """
    def clean(name):
        return re.sub('[^0-9a-zA-Z_]+', '_', name).strip('_').upper()

    # Names claimed by explicitly remapped columns are reserved up front so
    # that they win regardless of column order.
    taken = {clean(remapping[column]) for column in columns if column in remapping}

    normalized = []
    for column in columns:
        if column in remapping:
            normalized.append(clean(remapping[column]))
            continue
        base = clean(column)
        candidate, suffix = base, 1
        while candidate in taken:
            suffix += 1
            candidate = '%s_%d' % (base, suffix)
        taken.add(candidate)
        normalized.append(candidate)
    return normalized


# Write one data_clinical_patient.txt per study (cancer type).
# dropna() skips rows without an acronym, which would otherwise yield a NaN
# "study" and break study.lower() below.
for study in pancan_data.acronym.dropna().unique():
    # Rows of this study, without the columns that are empty for all of them
    study_data = pancan_data.loc[pancan_data.acronym == study].dropna(axis=1, how='all')

    # Print shape of clinical data file per cancer type
    print('Shape of %s: %s' % (study, study_data.shape))

    # The header is built from the original column names, before renaming
    study_header = _build_header(study_data, _infer_attribute_types(study_data))

    # Remap and normalise column names
    study_data.columns = _normalize_column_names(study_data.columns, COLUMN_REMAPPING)

    # Remap values
    if 'OS_STATUS' in study_data.columns:
        study_data['OS_STATUS'] = study_data['OS_STATUS'].replace(OS_STATUS_REMAPPING)

    # Write output: header rows first, then the column names and data appended
    output_dir = os.path.join(os.getcwd(), '%s_tcga_pan_can_atlas_2018' % study.lower())
    os.makedirs(output_dir, exist_ok=True)
    study_filename = os.path.join(output_dir, 'data_clinical_patient.txt')
    study_header.to_csv(study_filename, sep='\t', index=False, header=False, mode='w')
    study_data.to_csv(study_filename, sep='\t', index=False, header=True, mode='a')

Shape of input file (10956, 746)
Shape of ACC: (92, 85)
Shape of BLCA: (412, 88)
Shape of BRCA: (1099, 137)
Shape of CESC: (308, 153)
Shape of CHOL: (36, 83)
Shape of COAD: (459, 100)
Shape of DLBC: (48, 146)
Shape of ESCA: (185, 103)
Shape of GBM: (596, 40)
Shape of HNSC: (528, 74)
Shape of KICH: (113, 59)
Shape of KIRC: (537, 65)
Shape of KIRP: (291, 67)
Shape of LGG: (515, 73)
Shape of LIHC: (377, 79)
Shape of LUAD: (522, 133)
Shape of LUSC: (504, 131)
Shape of MESO: (87, 70)
Shape of OV: (587, 77)
Shape of PAAD: (185, 96)
Shape of PCPG: (179, 43)
Shape of PRAD: (500, 77)
Shape of READ: (171, 79)
Shape of SARC: (261, 69)
Shape of SKCM: (471, 85)
Shape of STAD: (443, 89)
Shape of TGCT: (134, 89)
Shape of THCA: (507, 78)
Shape of THYM: (124, 50)
Shape of UCEC: (548, 77)
Shape of UCS: (57, 69)
Shape of UVM: (80, 69)


In [40]:
# Print shape of data as it is currently in datahub

# Location of the datahub "public" directory. Defaults to the original
# hard-coded path; set the DATAHUB_DIR environment variable to override it.
datahub_dir = os.environ.get('DATAHUB_DIR', '/Users/sander/Data/datahub/public')

# Number of lines skipped at the top of each datahub file.
# NOTE(review): presumably the four '#'-prefixed header rows, as in the files
# written by this notebook; confirm against the datahub files.
DATAHUB_HEADER_ROWS = 4

for study in pancan_data.acronym.unique():
    datahub_file = os.path.join(datahub_dir,
                                '%s_tcga_pan_can_atlas_2018' % study.lower(),
                                'data_clinical_patient.txt')
    study_data = pd.read_table(datahub_file, skiprows=DATAHUB_HEADER_ROWS, dtype=object)
    print('Shape of %s: %s' % (study, study_data.shape))

Shape of ACC: (92, 30)
Shape of BLCA: (411, 30)
Shape of BRCA: (1084, 30)
Shape of CESC: (297, 30)
Shape of CHOL: (36, 30)
Shape of COAD: (439, 30)
Shape of DLBC: (48, 30)
Shape of ESCA: (182, 30)
Shape of GBM: (585, 30)
Shape of HNSC: (523, 30)
Shape of KICH: (65, 30)
Shape of KIRC: (512, 30)
Shape of KIRP: (283, 30)
Shape of LGG: (514, 30)
Shape of LIHC: (372, 30)
Shape of LUAD: (566, 30)
Shape of LUSC: (487, 30)
Shape of MESO: (87, 30)
Shape of OV: (585, 30)
Shape of PAAD: (184, 30)
Shape of PCPG: (178, 30)
Shape of PRAD: (494, 30)
Shape of READ: (155, 30)
Shape of SARC: (255, 30)
Shape of SKCM: (442, 30)
Shape of STAD: (440, 30)
Shape of TGCT: (149, 30)
Shape of THCA: (499, 30)
Shape of THYM: (123, 30)
Shape of UCEC: (529, 30)
Shape of UCS: (57, 30)
Shape of UVM: (80, 30)
