# Ingest Pancan + GTEX into TFRecord

https://stackoverflow.com/questions/46820500/how-to-handle-large-amouts-of-data-in-tensorflow/47040165#47040165

https://medium.com/tensorflow/training-and-serving-ml-models-with-tf-keras-fd975cc0fa27

In [11]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

!mkdir -p ~/data/pancan-gtex
os.chdir(os.path.expanduser("~/data/pancan-gtex"))

In [12]:
# with open("c2.cp.kegg.v6.2.symbols.gmt") as f:
#     genes = list(set().union(*[line.strip().split("\t")[2:] for line in f.readlines()]))
# print("Pruning to only include {} KEGG pathway genes".format(len(genes)))

genes = pd.read_table("cosmic_260818.tsv")["Gene Symbol"].values
print("Pruning to only include {} COSMIC pathway genes".format(len(genes)))

Pruning to only include 719 COSMIC pathway genes


In [13]:
ensemble_to_hugo = pd.read_table("ensemble_transcript_to_hugo.tsv", index_col="Gene name")
print("Found {} total transcripts".format(ensemble_to_hugo.shape[0]))
ensemble_to_hugo.head()

Found 224501 total transcripts


Unnamed: 0_level_0,Gene stable ID,Transcript stable ID,Gene stable ID version,Transcript stable ID version
Gene name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RF00100,ENSG00000276626,ENST00000612820,ENSG00000276626.1,ENST00000612820.1
RNU4-59P,ENSG00000201317,ENST00000364447,ENSG00000201317.1,ENST00000364447.1
SNORD114-2,ENSG00000200823,ENST00000363953,ENSG00000200823.1,ENST00000363953.1
MIR1249,ENSG00000221598,ENST00000408671,ENSG00000221598.3,ENST00000408671.3
RF00019,ENSG00000199595,ENST00000362725,ENSG00000199595.1,ENST00000362725.1


In [5]:
transcripts = ensemble_to_hugo.loc[ensemble_to_hugo.index.intersection(genes)]

In [6]:
len(set(ensemble_to_hugo.index.intersection(genes)))

702

In [7]:
transcripts = ensemble_to_hugo.loc[ensemble_to_hugo.index.intersection(genes)]["Transcript stable ID version"]
print("Including {} transcripts associated with {} genes".format(len(transcripts), len(genes)))

Including 212330 transcripts associated with 719 genes


In [8]:
for chunk in pd.read_table("gtex_Kallisto_tpm.gz", index_col=0, chunksize=1024):
    break

In [9]:
chunk.filter(items=transcripts, axis="index").shape

(1008, 7862)

In [10]:
# Prune X to only Cosmic Cancer Genes
print("Pruning to only COSMIC genes")
subset_of_genes = pd.read_table(os.path.expanduser("~/data/cosmic_260818.tsv"))["Gene Symbol"].values
    
pruned_X = X.drop(labels=(set(X.columns) - set(subset_of_genes)), axis=1)

# Order must match dataframe so we can use this as labels for shap
genes = list(pruned_X.columns.values)
print("Pruned expression to only include", len(genes), "genes")

Pruning to only COSMIC genes


FileNotFoundError: File b'/notebooks/data/cosmic_260818.tsv' does not exist

In [1]:
import sklearn.preprocessing

tumor_normal_encoder = sklearn.preprocessing.LabelEncoder().fit(Y["tumor_normal"])
primary_site_encoder = sklearn.preprocessing.LabelEncoder().fit(Y["_primary_site"])

with tf.python_io.TFRecordWriter(os.path.expanduser("~/data/pancan_gtex.tfrecord")) as writer:
    for (_, expression), (_, labels) in zip(pruned_X.iterrows(), Y.iterrows()):
        example = tf.train.Example(
            features=tf.train.Features(
                feature = {
                    "expression": tf.train.Feature(float_list=tf.train.FloatList(
                        value=expression.values)),
                    "tumor_normal": tf.train.Feature(int64_list=tf.train.Int64List(
                        value=tumor_normal_encoder.transform([labels["tumor_normal"]]))),
                    "primary_site": tf.train.Feature(int64_list=tf.train.Int64List(
                        value=primary_site_encoder.transform([labels["_primary_site"]]))),
                }))
        writer.write(example.SerializeToString())

NameError: name 'Y' is not defined

In [None]:
def create_model(input_shape, hyperparameters={"width": 64, "depth": 2, "penalty": 1e-5}):
    input_layer = tf.keras.Input(shape=(input_shape, ), name="input_layer")

    x = tf.keras.layers.BatchNormalization()(input_layer)
        
    for i in range(hyperparameters["depth"]):
        x = tf.keras.layers.Dense(hyperparameters["width"],
                                  activity_regularizer=tf.keras.regularizers.l1(
                                     hyperparameters["penalty"]), activation="relu")(x)
        x = tf.keras.layers.Dropout(0.5)(x)

#     tumor_normal_output = tf.keras.layers.Dense(1, activation="softmax", name="tumor_normal")(x)
    primary_site_output = tf.keras.layers.Dense(45, activation="softmax", name="primary_site")(x)
    
#     model = tf.keras.Model(inputs=input_layer, outputs=[tumor_normal_output, primary_site_output])
    model = tf.keras.Model(inputs=input_layer, outputs=primary_site_output)
#     model.compile(optimizer="adam",
#                   loss={"tumor_normal": "categorical_crossentropy",
#                         "primary_site": "categorical_crossentropy"})
    model.compile(optimizer="adam", loss={"primary_site": "categorical_crossentropy"})
    return model

model = create_model(input_shape=700)
model.summary()

In [None]:
def parse_one_example(example):
    features = {
        "expression": tf.FixedLenFeature([700], tf.float32),
        "tumor_normal": tf.FixedLenFeature([], tf.int64),
        "primary_site": tf.FixedLenFeature([], tf.int64)
    }
          
    example = tf.parse_single_example(example, features)
    return example["expression"], tf.one_hot(example["primary_site"], 45)

dataset = tf.data.TFRecordDataset(os.path.expanduser("~/data/pancan_gtex.tfrecord")).map(parse_one_example)
dataset = dataset.batch(32)

# Display one example 
# iterator = dataset.make_one_shot_iterator()
# expression, (tumor_normal, primary_site) = iterator.get_next()
# with tf.Session() as session:
#     X, y = session.run([expression, (tumor_normal, primary_site)])
#     print(X.shape, y[0].shape, y[1].shape)
#     print(X[0:5])
#     print(pruned_X.iloc[0].values[0:5])
# #     print(y)
#     print(X.shape)
# #     print(len(a), len(b), len(c))

model.fit(dataset, steps_per_epoch=100, epochs=1)