# Ingest Pancan + GTEX

Ingest Pancan and GTEX labels (clinical) and features (expression)

In [1]:
import os
import numpy as np
import pandas as pd

## Ingest Samples

In [2]:
# Create the working directory (if needed) and switch into it so that every
# relative path used later in the notebook resolves inside ~/data/pancan-gtex
!mkdir -p ~/data/pancan-gtex
os.chdir(os.path.expanduser("~/data/pancan-gtex"))

# Fetch the TCGA and GTEX Kallisto TPM matrices from the Xena Toil hub
# (-q: quiet, -N: timestamping, i.e. skip the download unless the remote file is newer)
!wget -q -N https://toil.xenahubs.net/download/tcga_Kallisto_tpm.gz
!wget -q -N https://toil.xenahubs.net/download/gtex_Kallisto_tpm.gz

In [3]:
%%time
# Convert to float32, Transpose to ML style rows = samples and hdf for significantly faster reading
if not os.path.exists(os.path.expanduser("tcga_Kallisto_tpm.T.fp32.h5")):
    pd.read_table(os.path.expanduser("tcga_Kallisto_tpm.gz"), index_col=0, engine='c') \
        .astype(np.float32).T \
        .to_hdf(os.path.expanduser("tcga_Kallisto_tpm.T.fp32.h5"), "expression", mode="w", format="fixed")
if not os.path.exists(os.path.expanduser("gtex_Kallisto_tpm.T.fp32.h5")):
    pd.read_table(os.path.expanduser("gtex_Kallisto_tpm.gz"), index_col=0, engine='c') \
        .astype(np.float32).T \
        .to_hdf(os.path.expanduser("gtex_Kallisto_tpm.T.fp32.h5"), "expression", mode="w", format="fixed")

CPU times: user 41min 37s, sys: 2min 38s, total: 44min 16s
Wall time: 44min 15s


In [4]:
%%time
# if os.environ["DEBUG"]:
#     print("Loading subset of samples for debugging")
#     tcga_samples = pd.read_hdf(os.path.expanduser("~/data/tcga_Kallisto_tpm.T.fp32.h5"), start=0, stop=100)
#     gtex_samples = pd.read_hdf(os.path.expanduser("~/data/gtex_Kallisto_tpm.T.fp32.h5"), start=0, stop=100)
# else:
tcga_samples = pd.read_hdf(os.path.expanduser("tcga_Kallisto_tpm.T.fp32.h5"))
gtex_samples = pd.read_hdf(os.path.expanduser("gtex_Kallisto_tpm.T.fp32.h5"))

# Make sure they have the exact same set of transcript names
assert tcga_samples.columns.equals(gtex_samples.columns)

CPU times: user 1.36 s, sys: 14.9 s, total: 16.3 s
Wall time: 16.3 s


In [5]:
# Combine into a single dataset and convert back into TPM.
# The stored values are log2(TPM + 0.001), so invert with 2**x - 0.001 and clamp any
# resulting negative values to zero.
# NOTE: DataFrame.clip_lower() was removed in pandas 1.0; clip(lower=...) is the replacement.
all_samples = pd.concat([tcga_samples, gtex_samples], axis="index").apply(np.exp2).subtract(0.001).clip(lower=0)
print("Ingested {} samples with {} features".format(all_samples.shape[0], all_samples.shape[1]))
all_samples.head()

Ingested 18525 samples with 197044 features


sample,ENST00000548312.5,ENST00000527779.1,ENST00000454820.5,ENST00000535093.1,ENST00000346219.7,ENST00000570899.1,ENST00000557761.1,ENST00000625998.2,ENST00000583693.5,ENST00000383738.6,...,ENST00000380620.8,ENST00000548698.5,ENST00000542429.2,ENST00000602837.1,ENST00000422233.5,ENST00000377138.1,ENST00000463473.2,ENST00000380293.3,ENST00000288710.6,ENST00000250055.2
TCGA-E9-A1N3-01,0.227062,0.0,0.0,1.76563,0.000208,0.090963,0.0,1.403153,0.273409,0.01403,...,0.043985,0.0,0.0,0.0,0.386267,0.0,0.024769,0.0,0.347058,0.384339
TCGA-EL-A3ZP-01,0.40815,0.886119,0.0,0.237821,0.486171,0.243753,0.0,2.931843,0.876822,0.007793,...,0.14727,0.0,0.0,0.111492,0.0,0.0,0.115896,0.09581,0.032398,0.371729
TCGA-E2-A152-01,0.101443,0.30587,0.128104,0.180419,0.017127,0.146706,0.0,0.0,0.182633,0.042321,...,7.39323,0.0,0.0,0.44673,0.0,0.0,0.0,0.0,0.042321,0.0
TCGA-66-2734-01,0.215029,1.765263,0.0,0.165132,0.036271,0.102213,0.0,0.175764,0.466617,0.017338,...,0.102135,0.0,0.541126,0.556174,0.0,0.0,0.074598,0.0,0.054823,21.661081
TCGA-BQ-5885-01,0.072516,0.0,0.0,0.128417,0.0,0.06533,0.0,3.001991,0.407951,0.0,...,0.48462,0.0,0.0,0.216381,0.0,0.0,0.0,0.694537,0.136738,0.310586


In [6]:
# Sanity check: the TPM of a sample should sum to 1M (spot-check evenly spaced samples)
all_samples.iloc[::len(all_samples) // 5].sum(axis="columns")

TCGA-E9-A1N3-01                 1.000000e+06
TCGA-CR-5249-01                 9.999975e+05
TCGA-27-1835-01                 1.000000e+06
GTEX-117YX-1426-SM-5H12H        9.999982e+05
GTEX-12ZZX-0011-R3b-SM-5EGLH    9.999986e+05
dtype: float32

## Convert Ensembl to Hugo

Kallisto outputs expression by Ensembl transcript. Convert to Hugo gene names, as that is what most pathway databases use and it is the primary way the clinical world talks about drug targeting, etc.

Source of reference: ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_23/gencode.v23.chr_patch_hapl_scaff.annotation.gtf.gz

Alternate locations:

http://uswest.ensembl.org/biomart/martview

http://www.gencodegenes.org/releases/

http://www.genenames.org/

In [7]:
# Fetch the GENCODE release 23 annotation used to map transcript ids to gene names
# (-q: quiet, -N: timestamping, i.e. skip the download unless the remote file is newer)
!wget -q -N http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_23/gencode.v23.chr_patch_hapl_scaff.annotation.gtf.gz

In [8]:
import re
import gzip

# Extract every (transcript_id, gene_name) pair from the GTF annotation text.
# The context manager makes sure the gzip handle is closed once the text is read.
with gzip.open("gencode.v23.chr_patch_hapl_scaff.annotation.gtf.gz", "rt") as annotation:
    matches = re.findall(r'transcript_id\W+?"(.*?)".*?gene_name\W+?"(.*?)"',
                         annotation.read(),
                         flags=re.MULTILINE)

# De-duplicate the pairs; sorting keeps the table order reproducible between runs
ensemble_to_hugo = pd.DataFrame(
    sorted(set(matches)), columns=["transcript", "gene"]).set_index("transcript")

In [9]:
# Collect the transcripts in the expression matrix that have no entry in the mapping table
missing = [transcript for transcript in all_samples.columns.values
           if transcript not in ensemble_to_hugo.index]
print("{} transcripts have no hugo mapping".format(len(missing)))

0 transcripts have no hugo mapping


In [10]:
# Build a copy of the expression matrix whose column labels are Hugo gene names
# (looked up per transcript, in the same order as the original columns)
hugo_samples = all_samples.copy()
hugo_samples.columns = ensemble_to_hugo.reindex(all_samples.columns)["gene"].values
hugo_samples.head()

Unnamed: 0,HVCN1,DCUN1D5,MSH5,ELP2,DLEC1,PHF23,RP11-81F13.1,PROSER1,ASPSCR1,CSPG5,...,B3GALT5,CSAD,RP11-108O10.8,UBE2E3,TMEM196,OR10C1,FCMR,AVP,DRC1,SOX15
TCGA-E9-A1N3-01,0.227062,0.0,0.0,1.76563,0.000208,0.090963,0.0,1.403153,0.273409,0.01403,...,0.043985,0.0,0.0,0.0,0.386267,0.0,0.024769,0.0,0.347058,0.384339
TCGA-EL-A3ZP-01,0.40815,0.886119,0.0,0.237821,0.486171,0.243753,0.0,2.931843,0.876822,0.007793,...,0.14727,0.0,0.0,0.111492,0.0,0.0,0.115896,0.09581,0.032398,0.371729
TCGA-E2-A152-01,0.101443,0.30587,0.128104,0.180419,0.017127,0.146706,0.0,0.0,0.182633,0.042321,...,7.39323,0.0,0.0,0.44673,0.0,0.0,0.0,0.0,0.042321,0.0
TCGA-66-2734-01,0.215029,1.765263,0.0,0.165132,0.036271,0.102213,0.0,0.175764,0.466617,0.017338,...,0.102135,0.0,0.541126,0.556174,0.0,0.0,0.074598,0.0,0.054823,21.661081
TCGA-BQ-5885-01,0.072516,0.0,0.0,0.128417,0.0,0.06533,0.0,3.001991,0.407951,0.0,...,0.48462,0.0,0.0,0.216381,0.0,0.0,0.0,0.694537,0.136738,0.310586


In [11]:
# Several Ensembl transcripts map to the same Hugo gene name. The source values were
# normalized via log2(TPM+0.001) and have already been converted back into TPM above,
# so the columns sharing a name can simply be summed.
reduced_samples = hugo_samples.groupby(level=0, axis="columns").sum()
print("After collapsing genes we have {} features".format(reduced_samples.shape[1]))

# Check that we still sum to 1M
reduced_samples.iloc[::len(reduced_samples) // 5].sum(axis="columns")

After collapsing genes we have 44792 features


TCGA-E9-A1N3-01                 1000000.125
TCGA-CR-5249-01                  999997.625
TCGA-27-1835-01                 1000000.125
GTEX-117YX-1426-SM-5H12H         999998.125
GTEX-12ZZX-0011-R3b-SM-5EGLH     999998.625
dtype: float32

## Ingest Labels

In [12]:
# Fetch the PanCanAtlas survival table and the TCGA/TARGET/GTEX phenotype table
# (-q: quiet, -N: timestamping, i.e. skip the download unless the remote file is newer)
!wget -q -N https://pancanatlas.xenahubs.net/download/Survival_SupplementalTable_S1_20171025_xena_sp.gz
!wget -q -N https://toil.xenahubs.net/download/TcgaTargetGTEX_phenotype.txt.gz

In [13]:
# Read the tab-separated survival table with every column as text, indexed by the
# first column and ordered by that index
survival_labels = pd.read_csv(
    "Survival_SupplementalTable_S1_20171025_xena_sp.gz",
    sep="\t", header=0, index_col=0, dtype=str,
    compression="gzip", encoding="ISO-8859-1",
).sort_index(axis="index")

In [14]:
# Read the tab-separated phenotype table with every column as text, indexed by the
# first column and ordered by that index
tcga_gtex_labels = pd.read_csv(
    "TcgaTargetGTEX_phenotype.txt.gz",
    sep="\t", header=0, index_col=0, dtype=str,
    compression="gzip", encoding="ISO-8859-1",
).sort_index(axis="index")

In [15]:
# Outer-join the phenotype and survival tables on their sample-id index so that a
# sample present in either table is kept.
# NOTE(review): the cast to str presumably turns missing values into the literal
# string "nan" (version dependent) - confirm before relying on isna()/dropna() later.
all_labels = tcga_gtex_labels.merge(
    survival_labels, how="outer", left_index=True, right_index=True
).astype(str)
print("Ingested {} labels for {} samples".format(all_labels.shape[1], all_labels.shape[0]))
all_labels.iloc[::len(all_labels) // 5]

Ingested 39 labels for 21226 samples


Unnamed: 0_level_0,detailed_category,primary disease or tissue,_primary_site,_sample_type,_gender,_study,_PATIENT,cancer type abbreviation,age_at_initial_pathologic_diagnosis,gender,...,residual_tumor,OS,OS.time,DSS,DSS.time,DFI,DFI.time,PFI,PFI.time,Redaction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-0226-SM-5GZZ7,Adipose - Subcutaneous,Adipose - Subcutaneous,Adipose Tissue,Normal Tissue,Female,GTEX,,,,,...,,,,,,,,,,
GTEX-QDVN-2126-SM-33HBS,Adipose - Subcutaneous,Adipose - Subcutaneous,Adipose Tissue,Normal Tissue,Male,GTEX,,,,,...,,,,,,,,,,
TARGET-50-PAJNAA-01,Wilms Tumor,Wilms Tumor,Kidney,Primary Solid Tumor,,TARGET,,,,,...,,,,,,,,,,
TCGA-AW-A1PO-01,,,,,,,TCGA-AW-A1PO,UCEC,66.0,FEMALE,...,,0.0,17.0,0.0,17.0,,,0.0,17.0,
TCGA-EI-6508-01,Rectum Adenocarcinoma,Rectum Adenocarcinoma,Rectum,Primary Tumor,Female,TCGA,TCGA-EI-6508,READ,48.0,FEMALE,...,,0.0,636.0,0.0,636.0,,,0.0,636.0,
TCGA-ZX-AA5X-01,Cervical & Endocervical Cancer,Cervical & Endocervical Cancer,Cervix,Primary Tumor,Female,TCGA,TCGA-ZX-AA5X,CESC,64.0,FEMALE,...,,0.0,119.0,0.0,119.0,,,0.0,119.0,


## Wrangle and Prune

Drop samples with missing values for fields we want to train on, transform field values for training etc...

In [16]:
def _has_label(labels, column):
    """Return a boolean mask of the rows of `labels` that hold a real value in `column`.

    The merged label table is cast to str, which can turn missing values into the
    literal string "nan"; dropna() does not recognise those, so both real
    NaN values and the "nan" placeholder are treated as missing here.
    """
    return labels[column].notna() & (labels[column] != "nan")


# Include only labels for samples that we have
pruned_labels = all_labels.loc[all_labels.index.intersection(all_samples.index)]
print("Starting with {} labeled sample pairs".format(pruned_labels.shape[0]))

# Drop samples that are missing labels we plan to classify
for required in ["_primary_site", "_gender"]:
    pruned_labels = pruned_labels[_has_label(pruned_labels, required)]
    print(pruned_labels.shape[0], "with", required)

# Some of the cell lines are normal and in any case it is not clear they are reliable signal
pruned_labels = pruned_labels[pruned_labels._sample_type != "Cell Line"]
print(pruned_labels.shape[0], "not Cell Line")

# Generate a Tumor/Normal label. Work on an explicit copy so that adding the column
# does not write into a slice of the previous frame.
pruned_labels = pruned_labels[_has_label(pruned_labels, "_sample_type")].copy()
print(pruned_labels.shape[0], "with _sample_type")
pruned_labels["tumor_normal"] = np.where(
    pruned_labels["_sample_type"].isin(["Normal Tissue", "Solid Tissue Normal"]),
    "Normal", "Tumor")

print("{} labels after pruning".format(pruned_labels.shape[0]))
pruned_labels.iloc[::pruned_labels.shape[0]//5]

Starting with 18397 labeled sample pairs
18397 with _primary_site
18397 with _gender
17964 not Cell Line
17964 with _sample_type
17964 labels after pruning


Unnamed: 0,detailed_category,primary disease or tissue,_primary_site,_sample_type,_gender,_study,_PATIENT,cancer type abbreviation,age_at_initial_pathologic_diagnosis,gender,...,OS,OS.time,DSS,DSS.time,DFI,DFI.time,PFI,PFI.time,Redaction,tumor_normal
GTEX-1117F-0226-SM-5GZZ7,Adipose - Subcutaneous,Adipose - Subcutaneous,Adipose Tissue,Normal Tissue,Female,GTEX,,,,,...,,,,,,,,,,Normal
GTEX-OHPK-0326-SM-2HMJO,Heart - Left Ventricle,Heart - Left Ventricle,Heart,Normal Tissue,Female,GTEX,,,,,...,,,,,,,,,,Normal
GTEX-ZVT4-1026-SM-57WC4,Breast - Mammary Tissue,Breast - Mammary Tissue,Breast,Normal Tissue,Female,GTEX,,,,,...,,,,,,,,,,Normal
TCGA-BB-7871-01,Head & Neck Squamous Cell Carcinoma,Head & Neck Squamous Cell Carcinoma,Head and Neck region,Primary Tumor,Female,TCGA,TCGA-BB-7871,HNSC,64.0,FEMALE,...,0.0,750.0,0.0,750.0,,,1.0,428.0,,Tumor
TCGA-ET-A3DV-01,Thyroid Carcinoma,Thyroid Carcinoma,Thyroid Gland,Primary Tumor,Female,TCGA,TCGA-ET-A3DV,THCA,68.0,FEMALE,...,0.0,5068.0,0.0,5068.0,0.0,5068.0,0.0,5068.0,,Tumor
TCGA-ZT-A8OM-01,Thymoma,Thymoma,Thymus,Primary Tumor,Female,TCGA,TCGA-ZT-A8OM,THYM,73.0,FEMALE,...,0.0,1398.0,0.0,1398.0,,,0.0,1398.0,,Tumor


## Export

Export the full dataset as an h5 file.

In [17]:
%%time
# Include only ids that are in reduced_samples and pruned_labels
# REMIND: Should we store as a sparse matric?
# https://www.hdfgroup.org/2018/06/hdf5-or-how-i-learned-to-love-data-compression-and-partial-i-o
# https://stackoverflow.com/questions/43390038/storing-scipy-sparse-matrix-as-hdf5
sample_ids = reduced_samples.index.intersection(pruned_labels.index)
print("Exporting {} samples".format(len(sample_ids)))

# NOTE: Setting complevel to 9 reduces the size of the resulting h5 file from 3G down to 2.1G
# but increases the read time from 2.79s to 20.8s and the write time from 19.9s to 25m
reduced_samples.loc[sample_ids].sort_index(axis="index").sort_index(axis="columns").to_hdf(
    os.path.expanduser("pancan_gtex.h5"), key="samples", mode="w", format="fixed", complevel=0)
pruned_labels.loc[sample_ids].sort_index(axis="index").sort_index(axis="columns").to_hdf(
    os.path.expanduser("pancan_gtex.h5"), key="labels", mode="a", format="fixed", complevel=0)

Exporting 17964 samples
CPU times: user 50.3 s, sys: 15.6 s, total: 1min 5s
Wall time: 1min 5s
