# Train and Optimize

Train a neural network on combined TCGA and GTEx gene expression data to classify each sample by primary site (tissue) and by tumor/normal status.

In [1]:
import sys
import os
import json
import pandas as pd
import numpy as np
import tensorflow as tf

# Fix the random seeds for reproducibility. Seeding numpy alone is not enough:
# the tf.keras layers used below (weight initialisation, dropout) draw their
# randomness from TensorFlow, so TensorFlow is seeded as well.
SEED = 42
np.random.seed(SEED)
if hasattr(getattr(tf, "random", None), "set_seed"):
    tf.random.set_seed(SEED)  # TensorFlow 2.x API
else:
    tf.set_random_seed(SEED)  # TensorFlow 1.x API

# Create the working directory (no-op if it already exists) and switch to it,
# so that the relative paths used in the rest of the notebook resolve against it.
data_dir = os.path.expanduser("~/data/pancan-gtex")
os.makedirs(data_dir, exist_ok=True)
os.chdir(data_dir)

## Load and Wrangle Data

In [2]:
# Read the "samples" and "labels" tables from the same HDF5 file
# (relative path, resolved against the current working directory)
hdf_path = os.path.expanduser("pancan-gtex-hugo.h5")
X = pd.read_hdf(hdf_path, "samples")
Y = pd.read_hdf(hdf_path, "labels")
print("Loaded {} samples with {} features and {} labels".format(*X.shape, Y.shape[1]))

Loaded 17277 samples with 44792 features and 40 labels


In [3]:
# Sanity check on a handful of evenly spaced samples: each row should still
# total roughly 1M, net of transcripts dropped because they had no mapping
X.iloc[::len(X) // 5].sum(axis=1)

GTEX-1117F-0226-SM-5GZZ7    1.000000e+06
GTEX-OHPK-0226-SM-3MJH6     1.000000e+06
GTEX-ZXG5-0326-SM-5GICH     1.000001e+06
TCGA-BA-5556-01             1.000000e+06
TCGA-EM-A2OW-01             1.000003e+06
TCGA-ZT-A8OM-01             9.999993e+05
dtype: float32

In [4]:
# Prune X to only KEGG pathway genes.
# Each line of the .gmt file is tab-separated; the first two fields are skipped
# and every remaining field is collected as a gene symbol.
subset_of_genes = set()
with open("c2.cp.kegg.v6.2.symbols.gmt") as gmt_file:
    for gmt_line in gmt_file:
        subset_of_genes.update(gmt_line.strip().split("\t")[2:])
subset_of_genes = list(subset_of_genes)
print("Pruning to only include KEGG pathway genes")

# # Prune X to only Cosmic Cancer Genes
# print("Pruning to only COSMIC genes")
# subset_of_genes = pd.read_table("cosmic-26-11-2018.tsv")["Gene Symbol"].values

# Keep only the columns of X whose name appears in the gene subset
pruned_X = X.loc[:, X.columns.isin(subset_of_genes)]

# Order must match dataframe so we can use this as labels for shap
genes = pruned_X.columns.tolist()
print("Pruned expression to only include", len(genes), "genes")

# Build the multi-label training target for tumor/normal and primary site:
# column 0 holds the integer tumor/normal code, the remaining columns hold the
# one-hot encoded primary site.
from sklearn.preprocessing import LabelEncoder

primary_site_encoder = LabelEncoder()
tumor_normal_encoder = LabelEncoder()

# Integer-encode each label column and store the codes alongside the originals in Y
primary_site_codes = primary_site_encoder.fit_transform(Y["_primary_site"])
Y["primary_site_value"] = pd.Series(primary_site_codes, index=Y.index, dtype="int32")

tumor_normal_codes = tumor_normal_encoder.fit_transform(Y["tumor_normal"])
Y["tumor_normal_value"] = pd.Series(tumor_normal_codes, index=Y.index, dtype="int32")

# Stack the tumor/normal column in front of the one-hot primary site matrix
tumor_normal_column = Y["tumor_normal_value"].values.reshape(-1, 1)
primary_site_onehot = tf.keras.utils.to_categorical(Y["primary_site_value"])
Y_onehot = np.concatenate((tumor_normal_column, primary_site_onehot), axis=1)

Pruning to only include KEGG pathway genes
Pruned expression to only include 5169 genes


In [5]:
# How much of each sample's TPM total remains after the pruning?
# (shown for a handful of evenly spaced samples)
pruned_X.iloc[::len(pruned_X) // 5].sum(axis=1)

GTEX-1117F-0226-SM-5GZZ7    437661.5000
GTEX-OHPK-0226-SM-3MJH6     354934.9375
GTEX-ZXG5-0326-SM-5GICH     328366.5000
TCGA-BA-5556-01             418472.6250
TCGA-EM-A2OW-01             301706.4375
TCGA-ZT-A8OM-01             590327.1250
dtype: float32

In [6]:
# Spot-check the pruned expression total for one individual sample
pruned_X.loc["TCGA-ZT-A8OM-01", :].sum()

590327.1

In [7]:
# Split into training and test sets stratified on primary site
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair of
# positional row indices; take it directly instead of looping.
train_index, test_index = next(split.split(X.values, Y.primary_site_value))

# Apply the same row indices to the features, the label frame and the targets
X_train, X_test = pruned_X.values[train_index], pruned_X.values[test_index]
Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
Y_onehot_train, Y_onehot_test = Y_onehot[train_index], Y_onehot[test_index]

print("Train:", X_train.shape, "Test:", X_test.shape)

Train: (13821, 5169) Test: (3456, 5169)


In [8]:
import matplotlib.pyplot as plt

# Let's see how big each class is in the train and test sets: one overlaid
# histogram for primary site, then one for tumor/normal.
for label_column, plot_title in [
        ("primary_site_value", "Primary site distribution between train and test sets"),
        ("tumor_normal_value", "Tumor/normal distribution between train and test sets")]:
    plt.hist(Y_train[label_column].values, alpha=0.5, label='Train')
    plt.hist(Y_test[label_column].values, alpha=0.5, label='Test')
    plt.legend(loc='upper right')
    plt.title(plot_title)
    plt.show()

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

## Build and Train Model

In [9]:
def create_model(input_shape, output_shape, params):
    """Build and compile a fully connected multi-label classifier.

    Architecture: batch normalisation of the raw inputs, followed by a stack
    of Dense(relu) + Dropout blocks, followed by a sigmoid output layer.

    Args:
        input_shape: Number of input features.
        output_shape: Number of output units.
        params: Dict of hyperparameters; may be empty or None, in which case
            the defaults below are used. Recognised keys:
              "hidden_layers": number of Dense + Dropout blocks (default 2)
              "hidden_units":  width of each hidden Dense layer (default 64)
              "dropout":       dropout rate after each hidden layer (default 0.5)
              "optimizer":     optimizer passed to compile (default "adam")

    Returns:
        A compiled tf.keras Model with binary cross-entropy loss and the
        "acc" metric.
    """
    # Previously `params` was accepted but ignored; the defaults reproduce the
    # original hard-coded network exactly when no keys are supplied.
    params = params or {}
    hidden_layers = params.get("hidden_layers", 2)
    hidden_units = params.get("hidden_units", 64)
    dropout_rate = params.get("dropout", 0.5)
    optimizer = params.get("optimizer", "adam")

    inputs = tf.keras.layers.Input(shape=(input_shape,))

    # Normalise the raw input features before the first dense layer
    x = tf.keras.layers.BatchNormalization()(inputs)

    for _ in range(hidden_layers):
        x = tf.keras.layers.Dense(hidden_units, activation="relu")(x)
        x = tf.keras.layers.Dropout(dropout_rate)(x)

    # Sigmoid outputs paired with binary cross-entropy score every output unit
    # as its own binary label (multi-label target).
    outputs = tf.keras.layers.Dense(output_shape, activation="sigmoid")(x)

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
    model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["acc"])
    return model

# Size the network from the data: one input per gene, one output per target column
n_inputs, n_outputs = X_train.shape[1], Y_onehot_train.shape[1]
model = create_model(n_inputs, n_outputs, {})
model.summary()

# Stop training once "acc" has failed to improve by at least 0.05 for 2 epochs.
# NOTE(review): no validation data is passed to fit, so "acc" here is measured
# on the training data, and 0.05 is a large required improvement — confirm this
# does not end training earlier than intended.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="acc", min_delta=0.05, patience=2, verbose=2, mode="max")
callbacks = [early_stopping]
model.fit(X_train, Y_onehot_train, epochs=10, batch_size=128, shuffle="batch", callbacks=callbacks)

# Report the loss and metric values on the held-out test set
metric_names = model.metrics_names
print(metric_names, model.evaluate(X_test, Y_onehot_test))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 5169)              0         
_________________________________________________________________
batch_normalization (BatchNo (None, 5169)              20676     
_________________________________________________________________
dense (Dense)                (None, 64)                330880    
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 34)                2210      
Total para

In [10]:
# Save the model to disk so we can load, evaluate, infer and explain independently.
# The output directory is created relative to the current working directory,
# i.e. exactly where the relative "models/..." paths below are written.
os.makedirs("models", exist_ok=True)

# See https://github.com/h5py/h5py/issues/712
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"

# Label classes, gene order and the train/test row indices, so that the model's
# inputs and outputs can be interpreted without re-running this notebook
with open("models/pancan-gtex-hugo-params.json", "w") as f:
    json.dump({
        "tumor_normal": tumor_normal_encoder.classes_.tolist(),
        "primary_site": primary_site_encoder.classes_.tolist(),
        "genes": genes,
        "train_indices": train_index.tolist(),
        "test_indices": test_index.tolist()}, f)

# Architecture (JSON) and weights (HDF5) are stored as separate files
with open("models/pancan-gtex-hugo-model.json", "w") as f:
    f.write(model.to_json())

model.save_weights("models/pancan-gtex-hugo-weights.h5")