# Deriving cancer genes from expression data

See if it's possible to train a deep neural network tumor/normal binary classifier using just the Toil TCGA, TARGET and GTEX expression datasets:

https://xenabrowser.net/datapages/?host=https://toil.xenahubs.net

If we can, then see whether any of the early layers recapitulates the COSMIC cancer gene census:

http://cancer.sanger.ac.uk/census/

In [7]:
import os
import numpy as np
import pandas as pd

# Seed NumPy's global random generator so repeated runs draw the same numbers
SEED = 42
np.random.seed(SEED)

In [27]:
# Before we import keras or tensorflow, pick a GPU with zero utilization and memory usage
import os
import re
import subprocess

def parse_gpu_utilization(report):
    """Extract per-GPU utilization from the text of an ``nvidia-smi -q`` report.

    Parameters
    ----------
    report : str or bytes
        Output of ``nvidia-smi -q``. ``subprocess.check_output`` returns bytes
        on Python 3, which cannot be searched with a str regex, so bytes are
        decoded first.

    Returns
    -------
    list of (str, str)
        One ``(gpu_percent, memory_percent)`` tuple per "Utilization" section,
        in the order the sections appear in the report.
    """
    if isinstance(report, bytes):
        report = report.decode("utf-8", errors="replace")
    # DOTALL lets ".*?" span the line breaks between the Utilization header
    # and its Gpu / Memory rows.
    return re.findall(r"Utilization.*?Gpu.*?(\d+).*?Memory.*?(\d+)",
                      report, flags=re.MULTILINE | re.DOTALL)


if "CUDA_HOME" in os.environ:
    utilization = parse_gpu_utilization(subprocess.check_output(["nvidia-smi", "-q"]))
    print("GPU Utilization", utilization)

    if ('0', '0') in utilization:
        idle_gpu = utilization.index(('0', '0'))
        print("Using GPU Device:", idle_gpu)
        os.environ["CUDA_VISIBLE_DEVICES"] = str(idle_gpu)
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # To ensure the index matches
    else:
        # The original bare `exit` was a no-op expression, so execution carried
        # on even with every GPU busy; raise so the cell (and Run All) stops.
        raise RuntimeError("All GPUs in Use")

In [15]:
from keras.utils.io_utils import HDF5Matrix
import h5py

input_file = "data/tumor_normal.h5"

# List the datasets stored in the HDF5 file
with h5py.File(input_file, "r") as f:
    print("Datasets:", list(f.keys()))

# The DEBUG environment variable (default "True") selects a small slice of the
# data with a short training schedule; any other value uses everything.
debug = os.getenv("DEBUG", "True") == "True"
if debug:
    train_span = {"start": 0, "end": 1000}
    test_span = {"start": 0, "end": 200}
else:
    train_span = test_span = {}

X_train = HDF5Matrix(input_file, "X_train", **train_span)
X_test = HDF5Matrix(input_file, "X_test", **test_span)
y_train = HDF5Matrix(input_file, "y_train", **train_span)
y_test = HDF5Matrix(input_file, "y_test", **test_span)
print("Training on partial dataset" if debug else "Training on full dataset")
epochs, batch_size = (1, 128) if debug else (16, 512)

print("X_train.shape:", X_train.shape, "epochs:", epochs, "batch_size:", batch_size)

Datasets: ['X_test', 'X_train', 'class_labels', 'classes_test', 'classes_train', 'features', 'genes', 'labels', 'y_test', 'y_train']
Training on partial dataset
X_train.shape: (1000, 60498) epochs: 1 batch_size: 128


In [10]:
# """
# Basic PCA into Dense

# Epoch 16/16
# 15300/15300 [==============================] - 23s - loss: 0.6338 - acc: 0.9603
# """
# from sklearn.decomposition import PCA
# from keras.layers import Dense
# from keras.models import Model, Sequential

# # PCA to reduce dimensions before network
# print("Computing PCA")
# X_train_pca = PCA(n_components=5000).fit_transform(X_train)

# classify = [
#     Dense(1000, input_dim=X_train_pca.shape[1], activation='relu'),
#     Dense(500, activation='relu'),
#     Dense(1, activation='sigmoid')
# ]

# model = Sequential(classify)
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.fit(X_train_pca, y_train, epochs=epochs, batch_size=batch_size, shuffle="batch")
# # model.evaluate(X_test, y_test)

Epoch 1/1


<keras.callbacks.History at 0x7f8e2c153278>

In [19]:
"""
Try replacing PCA with a sparse autoencoder
"""
from keras import regularizers
from keras.layers import Dense
from keras.models import Model, Sequential

# Layer stack: a wide first layer with L1 penalties on both its weights and its
# activations, two narrower ReLU layers, and a single sigmoid output unit.
# NOTE(review): the recorded debug-run output below shows loss ~14205 with
# accuracy 0.5 -- the L1 penalties may be dominating the loss; confirm.
n_features = X_train.shape[1]
sparsity = dict(kernel_regularizer=regularizers.l1(0.05),
                activity_regularizer=regularizers.l1(0.1))

classify = [Dense(2000, input_dim=n_features, activation='relu', **sparsity)]
classify += [Dense(width, activation='relu') for width in (1000, 500)]
classify.append(Dense(1, activation='sigmoid'))

model = Sequential(classify)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# shuffle="batch" is passed through unchanged from the original fit call
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, shuffle="batch")
# Last expression in the cell, so its return value is the displayed output
model.evaluate(X_test, y_test)

Epoch 1/1


[14205.6428125, 0.5]