#  Classify Treehouse

Load models trained in other notebooks and see how they do on the Treehouse samples

In [10]:
import os
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import matplotlib.pyplot as pyplot

# fix random seed for reproducibility
np.random.seed(42)

# See https://github.com/h5py/h5py/issues/712
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" 

## Load Datasets

In [70]:
%%time
X = pd.read_hdf("data/tcga_target_gtex.h5", "expression")
Y = pd.read_hdf("data/tcga_target_gtex.h5", "labels")

X_treehouse = pd.read_hdf("data/treehouse.h5", "expression")
Y_treehouse = pd.read_hdf("data/treehouse.h5", "labels")

CPU times: user 0 ns, sys: 13.7 s, total: 13.7 s
Wall time: 15.5 s


## Primary Site Classifier

In [31]:
# Load the model
model = keras.models.model_from_json(open("models/primary_site.model.json").read())
model.load_weights("models/primary_site.weights.h5")
params = json.loads(open("models/primary_site.params.json").read())

In [37]:
# Let's run it on the training set just to make sure we haven't lost something...
from sklearn import preprocessing
encoder = preprocessing.LabelBinarizer()
y_onehot = encoder.fit_transform(Y.primary_site.values)

# Prune X to only include genes in the gene sets
X_pruned = X.drop(labels=(set(X.columns) - set(params["genes"])), axis=1, errors="ignore")

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.evaluate(X_pruned, y_onehot)



[0.07240446106504723, 0.9799294959211106]

In [38]:
# Now let's try on Treehouse

# Prune X to only include genes in the gene sets
X_treehouse_pruned = X_treehouse.drop(labels=(set(X.columns) - set(params["genes"])), axis=1, errors="ignore")

In [66]:
Y_treehouse["primary_site_predicted"] = [", ".join(["{}({:0.2f})".format(params["labels"][i], p[i]) 
                                                    for i in p.argsort()[-3:][::-1]]) 
                                         for p in model.predict(X_treehouse_pruned)]
Y_treehouse.primary_site_predicted[0:3]

id
TH01_0051_S01                Kidney(0.17), Lung(0.15), Brain(0.06)
TH01_0053_S01    White blood cell(0.25), Brain(0.16), Kidney(0.15)
TH01_0054_S01       Lung(0.14), Skin(0.14), White blood cell(0.10)
Name: primary_site_predicted, dtype: object

In [67]:
Y_treehouse.to_csv("models/treehouse_predictions.tsv", sep="\t")