<a href="https://colab.research.google.com/github/rcurrie/pancan-gtex/blob/master/infer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import sys
import os
import json
import requests
import numpy as np
import pandas as pd
!pip install -q tables
import tensorflow as tf
from IPython.core.display import display, HTML

# Work out of a scratch data directory so every later path can stay relative.
# The directory is created on first use and left alone if it already exists.
data_dir = os.path.expanduser("~/data/pancan-gtex")
os.makedirs(data_dir, exist_ok=True)
os.chdir(data_dir)

In [25]:
# Load the background samples/labels and the ensemble-to-hugo table used for explanation.
# Both files are cached in the working directory and only downloaded when missing.
def _download(url, path):
    """Stream `url` into `path`, raising on any HTTP error.

    The body is written to a sibling ``.part`` file and renamed only after the
    download completes, so a failed or interrupted transfer never leaves a file
    that the ``os.path.exists`` cache checks below would mistake for good data.
    """
    partial_path = path + ".part"
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(partial_path, "wb") as f:
            # Chunked write keeps the large HDF5 file out of memory
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(partial_path, path)


if not os.path.exists("pancan-gtex.h5"):
    print("Downloading background...")
    _download("https://s3.nautilus.optiputer.net/stuartlab/pancan-gtex/pancan-gtex.h5",
              "pancan-gtex.h5")

X = pd.read_hdf("pancan-gtex.h5", "samples")
Y = pd.read_hdf("pancan-gtex.h5", "labels")
print("Loaded {} samples with {} features and {} labels".format(X.shape[0], X.shape[1], Y.shape[1]))


if not os.path.exists("ensemble-to-hugo.tsv"):
    _download("https://s3.nautilus.optiputer.net/stuartlab/pancan-gtex/ensemble-to-hugo.tsv",
              "ensemble-to-hugo.tsv")

ensemble_to_hugo = pd.read_table("ensemble-to-hugo.tsv", index_col=0)

Loaded 17277 samples with 8160 features and 40 labels


In [51]:
# Load params and trained model from S3
import tempfile

r = requests.get("https://s3.nautilus.optiputer.net/stuartlab/rcurrie/pancan-gtex/models/params.json")
r.raise_for_status()  # fail here rather than on a confusing JSON decode error
params = r.json()

r = requests.get("https://s3.nautilus.optiputer.net/stuartlab/rcurrie/pancan-gtex/models/model.h5")
r.raise_for_status()  # never hand an HTTP error page to the model loader

# load_model is given a filesystem path, so spool the downloaded bytes to disk first.
# TemporaryDirectory uses the public tempfile API, picks the platform's temp location,
# and removes the file even when load_model raises.
with tempfile.TemporaryDirectory() as temp_dir:
    temp_path = os.path.join(temp_dir, "model.h5")
    with open(temp_path, "wb") as f:
        f.write(r.content)
    # The file is closed (and therefore flushed) before it is read back
    model = tf.keras.models.load_model(temp_path)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_187 (InputLayer)       (None, 8160)              0         
_________________________________________________________________
batch_normalization_186 (Bat (None, 8160)              32640     
_________________________________________________________________
dense_708 (Dense)            (None, 128)               1044608   
_________________________________________________________________
dropout_522 (Dropout)        (None, 128)               0         
_________________________________________________________________
dense_709 (Dense)            (None, 128)               16512     
_________________________________________________________________
dropout_523 (Dropout)        (None, 128)               0         
_________________________________________________________________
dense_710 (Dense)            (None, 34)                4386      
Total para

In [5]:
# Ask the user to upload the sample file through google.colab's upload helper.
# `uploaded` is consumed by the next cell, which reads the file named by its first key
# (presumably a filename -> content mapping; confirm against the google.colab docs).
# NOTE(review): this import is expected to work only inside a Colab runtime.
from google.colab import files

uploaded = files.upload()

Saving abundance.tsv to abundance.tsv


In [15]:
# Load the sample's expression kallisto TPM output, keeping only the "tpm" row
assert uploaded, "No file was uploaded; re-run the upload cell and choose an abundance file"
sample_path = next(iter(uploaded.keys()))  # first uploaded file name
sample = (
    pd.read_table(sample_path, index_col=0, engine='c')
    .astype(np.float32)
    .T.loc[["tpm"]]
)

# Sanity check that the row is TPM-normalised (sums to one million). The sum is
# accumulated in float32, so compare with a tolerance: truncating to int and
# testing for exact equality rejects valid totals such as 999999.9.
tpm_total = float(sample.iloc[0].sum())
assert np.isclose(tpm_total, 1e6, rtol=1e-4), \
    "Expected TPM values to sum to ~1e6, got {}".format(tpm_total)
sample.head()

target_id,ENST00000619216.1,ENST00000473358.1,ENST00000469289.1,ENST00000607096.1,ENST00000417324.1,ENST00000461467.1,ENST00000335137.3,ENST00000466430.5,ENST00000495576.1,ENST00000477740.5,...,ENST00000306641.1,ENST00000612245.1,ENST00000361963.3,ENST00000306609.4,ENST00000516617.1,ENST00000417334.1,ENST00000516816.1,ENST00000515987.1,ENST00000517139.1,ENST00000620883.1
tpm,13.5906,0.042316,0.0,0.0,0.0,1.49707,0.0,1.25308,0.235824,0.0,...,0.924945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
# Restrict the sample to the columns named in the background matrix X, then run the
# model and keep the output row for this single sample.
features = sample.filter(items=X.columns, axis="columns")
prediction = model.predict(features)[0]

In [53]:
# Tumor/normal: output 0 is rounded to pick the label index in params["tumor_normal"].
tumor_normal_prediction_value = prediction[0]
tumor_normal_prediction_index = int(round(tumor_normal_prediction_value))
tumor_normal_prediction_label = params["tumor_normal"][tumor_normal_prediction_index]
# The raw score is the score for label index 1, so when index 0 is chosen the
# confidence in that choice is the complement; showing the raw value would report
# a confident index-0 call (e.g. score 0.03) as "0.03 confidence".
tumor_normal_confidence = (
    tumor_normal_prediction_value
    if tumor_normal_prediction_index == 1
    else 1 - tumor_normal_prediction_value
)
display(HTML("<h3>{} with {:.2f} confidence</h3>".format(
    tumor_normal_prediction_label, tumor_normal_confidence)))

# Primary site: highest-scoring entry among outputs 1..-2, looked up in params["primary_site"].
# NOTE(review): the [1:-1] slice leaves out the final model output; confirm that the last
# output is not a primary-site class and that the slice lines up with params["primary_site"].
primary_site_scores = prediction[1:-1]
primary_site_prediction_index = np.argmax(primary_site_scores)
primary_site_prediction_value = primary_site_scores[primary_site_prediction_index]
primary_site_prediction_label = params["primary_site"][primary_site_prediction_index]
# Closing </h3> added; the original heading was left unterminated
display(HTML("<h3>{} with {:.2f} confidence</h3>".format(
    primary_site_prediction_label, primary_site_prediction_value)))


In [50]:
# Echo the raw model output vector so the individual scores behind the headings can be inspected
prediction

array([9.4832599e-01, 2.4207429e-06, 1.8166145e-05, 1.9153147e-03,
       4.8664391e-05, 4.3676779e-04, 6.0575334e-05, 1.7884467e-02,
       1.9926882e-04, 1.3954876e-04, 3.9501395e-03, 3.1814292e-07,
       5.3676138e-07, 4.7649330e-05, 1.3659683e-04, 6.6956236e-05,
       2.6884291e-03, 2.6197486e-05, 1.9801676e-06, 4.1784355e-05,
       2.0032907e-05, 2.7601011e-06, 2.2110816e-04, 7.9341334e-07,
       6.3489555e-05, 1.9427577e-04, 5.5513254e-08, 1.1718812e-04,
       1.6861080e-04, 3.3514602e-03, 3.5417813e-04, 3.9611191e-06,
       3.1357940e-04, 1.1154541e-03], dtype=float32)