# Train and Optimize

Train a neural network on TCGA + GTEx gene expression data to classify each sample by tumor/normal status and primary site.

In [1]:
import sys
import os
import json
import pandas as pd
import numpy as np

# Fix NumPy's global random seed for reproducibility.
# NOTE(review): this seeds only NumPy's global generator; whether the Keras/TensorFlow
# backend draws from it for weight initialisation and dropout is not shown here — confirm.
np.random.seed(42)

## Load and Wrangle Data

In [2]:
# Both tables are read from the same HDF5 store: "samples" becomes the feature
# matrix X and "labels" becomes the per-sample label table Y.
hdf_path = os.path.expanduser("~/data/pancan_gtex.h5")
X = pd.read_hdf(hdf_path, "samples")
Y = pd.read_hdf(hdf_path, "labels")

n_samples, n_features = X.shape
print("Loaded {} samples with {} features and {} labels".format(n_samples, n_features, Y.shape[1]))

Loaded 17964 samples with 42326 features and 42 labels


In [3]:
# Alternative gene subset (disabled): prune X to only KEGG pathway genes
# with open(os.path.expanduser("~/data/msigdb/c2.cp.kegg.v6.2.symbols.gmt")) as f:
#     subset_of_genes = list(set().union(*[line.strip().split("\t")[2:] for line in f.readlines()]))
# print("Pruning to only include KEGG pathway genes")

# Active gene subset: the "Gene Symbol" column of the COSMIC table
print("Pruning to only COSMIC genes")
cosmic_table = pd.read_table("../data/cosmic_260818.tsv")
subset_of_genes = cosmic_table["Gene Symbol"].values

# Drop every expression column whose name is not in the chosen subset;
# the surviving columns keep their original order.
columns_to_drop = set(X.columns).difference(subset_of_genes)
pruned_X = X.drop(labels=columns_to_drop, axis=1)

# Order must match the dataframe so this list can be used as labels for shap
genes = pruned_X.columns.tolist()
print("Pruned expression to only include", len(genes), "genes")

# Build the multi-label training target: one tumor/normal column followed by
# a one-hot encoding of the primary site.
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# Integer-encode each categorical label and store the codes as new int32 columns of Y
primary_site_encoder = LabelEncoder()
primary_site_codes = primary_site_encoder.fit_transform(Y["_primary_site"])
Y["primary_site_value"] = pd.Series(primary_site_codes, index=Y.index, dtype="int32")

tumor_normal_encoder = LabelEncoder()
tumor_normal_codes = tumor_normal_encoder.fit_transform(Y["tumor_normal"])
Y["tumor_normal_value"] = pd.Series(tumor_normal_codes, index=Y.index, dtype="int32")

# Column 0 holds the tumor/normal code, the remaining columns the one-hot primary site.
# NOTE(review): assumes tumor_normal has exactly two classes so the code is 0/1 — confirm.
tumor_normal_column = Y["tumor_normal_value"].values.reshape(Y.shape[0], -1)
primary_site_onehot = np_utils.to_categorical(Y["primary_site_value"])
Y_onehot = np.concatenate((tumor_normal_column, primary_site_onehot), axis=1)

Pruning to only COSMIC genes
Pruned expression to only include 700 genes


Using TensorFlow backend.


In [4]:
# Split into training and test sets stratified on primary site
from sklearn.model_selection import StratifiedShuffleSplit

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(X.values, Y.primary_site_value))

# Apply the same row indices to the pruned features, the label table and the one-hot target
pruned_values = pruned_X.values
X_train, X_test = pruned_values[train_index], pruned_values[test_index]
Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
Y_onehot_train, Y_onehot_test = Y_onehot[train_index], Y_onehot[test_index]

print("Train:", X_train.shape, "Test:", X_test.shape)

Train: (14371, 700) Test: (3593, 700)


In [5]:
# import matplotlib.pyplot as plt

# # Let's see how big each class is based on primary site
# plt.hist(Y_train.primary_site_value.values, alpha=0.5, label='Train')
# plt.hist(Y_test.primary_site_value.values, alpha=0.5, label='Test')
# plt.legend(loc='upper right')
# plt.title("Primary site distribution between train and test sets")
# plt.show()

# # Let's see how big each class is based on tumor/normal
# plt.hist(Y_train.tumor_normal_value.values, alpha=0.5, label='Train')
# plt.hist(Y_test.tumor_normal_value.values, alpha=0.5, label='Test')
# plt.legend(loc='upper right')
# plt.title("Tumor/normal distribution between train and test sets")
# plt.show()

## Build and Train Model

In [6]:
from keras.models import Model
from keras.layers import Input, BatchNormalization, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras import regularizers

def create_model(input_shape, output_shape, params):
    """Build and compile a dense multi-label classifier.

    The network batch-normalises its inputs, applies a stack of ReLU dense
    layers each followed by dropout, and ends in a sigmoid output layer. It is
    compiled with binary cross-entropy loss, the Adam optimizer and the "acc"
    metric.

    Args:
        input_shape: Number of input features per sample.
        output_shape: Number of output units.
        params: Optional dict of hyperparameters; may be empty or None.
            Recognised keys (the defaults reproduce the previously hard-coded
            architecture, so passing {} builds the same model as before):
              "hidden_layers": number of dense + dropout blocks (default 2)
              "hidden_units":  units in each dense layer (default 64)
              "dropout":       dropout rate after each dense layer (default 0.5)

    Returns:
        The compiled keras Model.
    """
    params = params or {}
    hidden_layers = params.get("hidden_layers", 2)
    hidden_units = params.get("hidden_units", 64)
    dropout_rate = params.get("dropout", 0.5)

    inputs = Input(shape=(input_shape,))

    x = BatchNormalization()(inputs)

    # Layers are created in the same order as before (dense, dropout, dense, ...)
    for _ in range(hidden_layers):
        x = Dense(hidden_units, activation="relu")(x)
        x = Dropout(dropout_rate)(x)

    # Sigmoid rather than softmax: every output unit is scored independently
    outputs = Dense(output_shape, activation="sigmoid")(x)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])
    return model

# One input unit per retained gene, one output unit per column of the one-hot target
n_inputs = X_train.shape[1]
n_outputs = Y_onehot_train.shape[1]
model = create_model(n_inputs, n_outputs, {})
model.summary()

# Stop when training accuracy has not improved by at least 0.05 for 2 epochs.
# NOTE(review): this monitors the training metric, not a validation metric — confirm intended.
early_stopping = EarlyStopping(monitor="acc", min_delta=0.05, patience=2, verbose=2, mode="max")
callbacks = [early_stopping]
model.fit(X_train, Y_onehot_train, epochs=10, batch_size=128, shuffle="batch", callbacks=callbacks)

# Report the compiled metrics on the held-out test split
test_metrics = model.evaluate(X_test, Y_onehot_test)
print(model.metrics_names, test_metrics)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 700)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 700)               2800      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                44864     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 46)                2990      
Total para

In [7]:
# Save the model to disk so we can load, evaluate, infer and explain independently

# See https://github.com/h5py/h5py/issues/712
# NOTE(review): HDF5 may already have been loaded by earlier cells (read_hdf / keras);
# confirm the variable still takes effect when it is set this late.
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"

# open(..., "w") does not create parent directories, so make sure the output
# directory exists before writing (no-op when it is already there)
os.makedirs("models", exist_ok=True)

# Everything needed to interpret the model's inputs and outputs later:
# class names for both label encoders, the gene order of the input columns,
# and the row indices of the train/test split.
with open("models/pancan_gtex.params.json", "w") as f:
    json.dump({
        "tumor_normal": tumor_normal_encoder.classes_.tolist(),
        "primary_site": primary_site_encoder.classes_.tolist(),
        "genes": genes,
        "train_indices": train_index.tolist(),
        "test_indices": test_index.tolist()}, f)

# Architecture and weights are stored separately: JSON for the topology, HDF5 for the weights
with open("models/pancan_gtex.model.json", "w") as f:
    f.write(model.to_json())

model.save_weights("models/pancan_gtex.weights.h5")