# Classifying Tumor vs. Normal from Gene Expression

Explore whether a deep neural network can be trained as a binary tumor/normal classifier using only the Toil TCGA, TARGET, and GTEx expression data (see ingest.ipynb for details on these data).

In [5]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import keras

# Seed NumPy's global random number generator so NumPy-driven randomness
# repeats from run to run.
# NOTE(review): only NumPy is seeded here; the TensorFlow/Keras backend
# presumably keeps its own RNG state -- confirm if exact reproducibility of
# training is required.
np.random.seed(seed=42)

In [6]:
%%time
# Read the training data: the expression matrix (X) and the per-sample
# labels (Y) are stored under separate keys of the same HDF5 file.
hdf_path = "data/tcga_target_gtex.h5"
X, Y = (pd.read_hdf(hdf_path, key) for key in ("expression", "labels"))

CPU times: user 225 ms, sys: 23.1 s, total: 23.3 s
Wall time: 25.8 s


In [7]:
# Convert the string columns tumor_normal and primary_site into integer class
# ids (stored as tumor_normal_value and primary_site_value) for training.
# Each column gets its own fitted encoder, kept under its own name, so the
# integer ids can later be mapped back to the original labels through
# `.classes_` / `.inverse_transform()`.
from sklearn.preprocessing import LabelEncoder
tumor_normal_encoder = LabelEncoder()
Y["tumor_normal_value"] = tumor_normal_encoder.fit_transform(Y["tumor_normal"])
primary_site_encoder = LabelEncoder()
Y["primary_site_value"] = primary_site_encoder.fit_transform(Y["primary_site"])
# Backward compatibility: `encoder` previously ended up bound to the
# primary_site encoder, so keep that name pointing at the same object.
encoder = primary_site_encoder
# Summarise every column (counts, uniques, basic statistics) as the cell output
Y.describe(include="all", percentiles=[])

Unnamed: 0,category,disease,primary_site,sample_type,gender,study,tumor_normal,tumor_normal_value,primary_site_value
count,19126,19126,19126,19126,19126,19126,19126,19126.0,19126.0
unique,93,93,46,16,3,3,2,,
top,Breast Invasive Carcinoma,Breast Invasive Carcinoma,Brain,Primary Tumor,Male,TCGA,Tumor,,
freq,1212,1212,1846,9185,10453,10534,10530,,
mean,,,,,,,,0.550559,20.651992
std,,,,,,,,0.49745,12.419634
min,,,,,,,,0.0,0.0
50%,,,,,,,,1.0,19.0
max,,,,,,,,1.0,45.0


In [8]:
# Split into stratified training and test sets based on classes (i.e. tissue type) so that we have equal
# proportions of each tissue type in the train and test sets
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair of
# positional row indices; take it directly instead of looping.
# NOTE(review): assumes the rows of X and Y are in the same sample order -- confirm in ingest.ipynb
train_index, test_index = next(split.split(X.values, Y["primary_site_value"]))

X_train, X_test = X.values[train_index], X.values[test_index]
# The indices are positions, not index labels, so select with .iloc. Plain
# Series[...] with integers is label-based on an integer index and only a
# deprecated positional fallback otherwise.
y_train, y_test = Y["tumor_normal_value"].iloc[train_index], Y["tumor_normal_value"].iloc[test_index]
classes_train, classes_test = Y["primary_site_value"].values[train_index], Y["primary_site_value"].values[test_index]

print(X_train.shape, X_test.shape)

(15300, 58581) (3826, 58581)


In [9]:
"""
Batch normalization with a sparse layer.
"""
from keras.models import Model, Sequential
from keras.layers import InputLayer, Dense, BatchNormalization, Activation, Dropout
from keras.callbacks import EarlyStopping
from keras import regularizers

# Training hyper-parameters
epochs = 2
batch_size = 128

# Layer stack, assembled stage by stage in network order:
# input -> batch norm -> two hidden stages -> single sigmoid output.
n_features = X_train.shape[1]

# Input stage: normalise the raw expression features
classify = [InputLayer(input_shape=(n_features,)), BatchNormalization()]

# First hidden stage: 1000 units, batch norm, ReLU
classify += [Dense(1000), BatchNormalization(), Activation('relu')]

# Second hidden stage: 500 units with an L1 penalty on the layer's
# activations, then batch norm and ReLU
classify += [
    Dense(500, activity_regularizer=regularizers.l1(1e-5)),
    BatchNormalization(),
    Activation('relu'),
]

# Output stage: one sigmoid unit for the binary tumor/normal label
classify += [Dense(1), Activation('sigmoid')]

model = Sequential(classify)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Stop when the training accuracy ('acc') fails to improve by at least 0.05
# for 2 consecutive epochs.
# NOTE(review): with epochs = 2 and patience=2 this callback presumably can
# never stop training early -- confirm whether that is intended.
callbacks = [
    EarlyStopping(monitor='acc', min_delta=0.05, patience=2, verbose=2, mode="max"),
]

model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    shuffle="batch",
    callbacks=callbacks,
)

# Report the metric names alongside their values on the held-out test set
print(model.metrics_names, model.evaluate(X_test, y_test))

Epoch 1/2
Epoch 2/2
['loss', 'acc'] [0.0878353332842652, 0.9827496079456352]


In [10]:
"""
Save model and weights so we can copy them back from the GPU machine
to visualize and evaluate locally.
"""
# Serialise the architecture once and reuse the string below, instead of
# calling to_json() a second time when writing the file.
model_json = model.to_json()

# Create the output directory if needed; exist_ok avoids the separate
# check-then-create step.
os.makedirs("models", exist_ok=True)
with open("models/model.json", "w") as f:
    f.write(model_json)

# NOTE(review): the recorded run of this cell ended with
# "OSError: Unable to create file (unable to lock file, errno = 37, ...)".
# Presumably the target filesystem does not support HDF5 file locking.
# Consider saving to a local disk or disabling HDF5 file locking in the
# environment -- confirm on the GPU machine.
model.save_weights("models/weights.h5", overwrite=True)
print("Saved to model.json and weights.h5")

OSError: Unable to create file (unable to lock file, errno = 37, error message = 'No locks available')