<a href="https://colab.research.google.com/github/rcurrie/pancan-gtex/blob/master/infer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Infer
Infer primary site and tumor vs. normal status from kallisto expression data using a trained neural network, then explain the classification using the SHAP package.

In [0]:
import sys
import os
import json
import requests
import numpy as np
import pandas as pd
!pip install -q tables
import tensorflow as tf
from IPython.core.display import display, HTML

# Switch to a scratch data directory so all paths are local
os.makedirs(os.path.expanduser("~/data/pancan-gtex"), exist_ok=True)
os.chdir(os.path.expanduser("~/data/pancan-gtex"))

In [82]:
# Open an unsigned S3 connection and list the training set and trained model objects
import boto3
from botocore.handlers import disable_signing

bucket_name = "stuartlab"
endpoint = "s3.nautilus.optiputer.net"

# Set so that Tensorflow can pull from the PRP S3/CEPH storage cluster
os.environ["S3_ENDPOINT"] = endpoint

session = boto3.session.Session()
resource = boto3.resource("s3", endpoint_url="https://" + endpoint)
# Register the handler that turns off request signing for S3 calls on this client
resource.meta.client.meta.events.register('choose-signer.s3.*', disable_signing)
bucket = resource.Bucket(bucket_name)

# For each group print the modification time, the ETag with its first and last
# character stripped (presumably the surrounding quotes), and the object key
for heading, prefix in (
    ("Dataset:", "pancan-gtex"),
    ("Trained Model:", "rcurrie/pancan-gtex/models"),
):
    print(heading)
    for obj in bucket.objects.filter(Prefix=prefix):
        modified = obj.last_modified.isoformat()
        checksum = obj.e_tag[1:-1]
        print(modified, checksum, obj.key)

Dataset:
2018-11-23T17:26:45.350000+00:00 ef1c9c15b5a1fd836521f0759e57dfa7-120 pancan-gtex
2019-01-18T22:57:17.416000+00:00 db47628dee79f558ad403c0a6cfe163d-2 pancan-gtex/ensemble-to-hugo.tsv
2019-01-18T22:57:13.190000+00:00 2b90ca5a5ea09a6353865afdd76a0ade-64 pancan-gtex/pancan-gtex.h5
Trained Model:
2019-01-19T20:47:25.400000+00:00 411ee9f1b92a4a1fc057994a8afc44cc-2 rcurrie/pancan-gtex/models/model-debug.h5
2019-01-19T20:25:19.705000+00:00 37eab74d9c22969a13fcc93c6a347247-2 rcurrie/pancan-gtex/models/model.h5
2019-01-19T20:47:23.954000+00:00 345e77a0364204eed637aa58aa61c763 rcurrie/pancan-gtex/models/params-debug.json
2019-01-19T20:25:18.273000+00:00 dec23a80ae61ce3a1673d1fb0b22eb7b rcurrie/pancan-gtex/models/params.json


In [27]:
# Load background samples and ensemble to hugo table for explanation.
# Each file is cached in the working directory and only downloaded when absent.
if not os.path.exists("pancan-gtex.h5"):
    print("Downloading backgound...")
    r = requests.get("https://s3.nautilus.optiputer.net/stuartlab/pancan-gtex/pancan-gtex.h5")
    # Fail now rather than caching an HTTP error body under the dataset's name,
    # which the exists() guard above would then never replace
    r.raise_for_status()
    with open("pancan-gtex.h5", "wb") as f:
        f.write(r.content)

X = pd.read_hdf("pancan-gtex.h5", "samples")
Y = pd.read_hdf("pancan-gtex.h5", "labels")
print("Loaded {} samples with {} features and {} labels".format(X.shape[0], X.shape[1], Y.shape[1]))


if not os.path.exists("ensemble-to-hugo.tsv"):
    r = requests.get("https://s3.nautilus.optiputer.net/stuartlab/pancan-gtex/ensemble-to-hugo.tsv")
    r.raise_for_status()
    with open("ensemble-to-hugo.tsv", "wb") as f:
        f.write(r.content)

ensemble_to_hugo = pd.read_table("ensemble-to-hugo.tsv", index_col=0)

Loaded 17277 samples with 7564 features and 40 labels


In [28]:
# Load params and trained model from S3
import tempfile

r = requests.get("https://s3.nautilus.optiputer.net/stuartlab/rcurrie/pancan-gtex/models/params.json")
r.raise_for_status()
params = r.json()

r = requests.get("https://s3.nautilus.optiputer.net/stuartlab/rcurrie/pancan-gtex/models/model.h5")
r.raise_for_status()

# load_model is given a filesystem path, so stage the downloaded bytes in a
# temporary file. NamedTemporaryFile picks a unique name in the platform's temp
# directory; delete=False keeps the file after the handle is closed so it can
# be re-opened by path.
with tempfile.NamedTemporaryFile(suffix=".h5", delete=False) as model_file:
    model_file.write(r.content)
    temp_path = model_file.name

try:
    model = tf.keras.models.load_model(temp_path)
finally:
    # Remove the staged copy even when loading fails
    os.remove(temp_path)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 7564)              0         
_________________________________________________________________
batch_normalization (BatchNo (None, 7564)              30256     
_________________________________________________________________
dense (Dense)                (None, 128)               968320    
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 34)                4386      
Total para

In [0]:
!rm abundance.tsv

In [104]:
# Ask the user to upload the sample file through Google Colab's file helper
# (this import is only available when running inside Colab).
from google.colab import files

# `uploaded` supports .keys(); a later cell reads its first key as the path of
# the saved file (presumably a filename -> content mapping - confirm in the Colab docs).
uploaded = files.upload()

Saving abundance.tsv to abundance.tsv


In [101]:
# List the working directory to check which data files are present
!ls

abundance.tsv  ensemble-to-hugo.tsv  pancan-gtex.h5


In [105]:
# Load the sample's expression kallisto TPM output
if not uploaded:
    raise ValueError("No file was uploaded; upload a kallisto abundance.tsv first")
sample_path = next(iter(uploaded.keys()))

# Transpose so the sample is one row, and keep only the "tpm" row
sample = pd.read_table(sample_path, index_col=0, engine='c').astype(np.float32).T.loc[["tpm"]]

# Sanity check that the TPM values add up to one million. The total is
# accumulated in float64 and compared with a tolerance, because truncating a
# float32 sum with int() rejects valid data that lands just below 1e6.
tpm_total = sample.iloc[0].values.astype(np.float64).sum()
assert np.isclose(tpm_total, 1e6, rtol=1e-4), \
    "Expected TPM values to sum to ~1e6, got {}".format(tpm_total)
sample.head()

target_id,ENST00000619216.1,ENST00000473358.1,ENST00000469289.1,ENST00000607096.1,ENST00000417324.1,ENST00000461467.1,ENST00000335137.3,ENST00000466430.5,ENST00000495576.1,ENST00000477740.5,...,ENST00000306641.1,ENST00000612245.1,ENST00000361963.3,ENST00000306609.4,ENST00000516617.1,ENST00000417334.1,ENST00000516816.1,ENST00000515987.1,ENST00000517139.1,ENST00000620883.1
tpm,26.0229,0.0,0.0,0.0,0.0,0.0,0.0,2.07142,0.0,0.0,...,1.19823,0.507383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
# Feed the model the sample's features in exactly the column order of the
# background set X. Sorting the selected columns alphabetically only matches
# that order if X itself happens to be sorted.
missing_features = X.columns.difference(sample.columns)
if len(missing_features) > 0:
    # Dropping absent features silently would hand the model an input of the wrong width
    raise ValueError("Sample is missing {} of {} expected features, e.g. {}".format(
        len(missing_features), len(X.columns), list(missing_features[:5])))

sample_features = sample.loc[:, X.columns]
prediction = model.predict(sample_features)[0]

In [0]:
# To predict any of the training samples:
# prediction = model.predict(X.loc[["TCGA-ZS-A9CE-01"]])[0]

In [107]:
# --- Tumor vs. normal -------------------------------------------------------
# Output 0 is the tumor/normal score; rounding it selects the entry of
# params["tumor_normal"] to report.
tumor_normal_prediction_value = prediction[0]
tumor_normal_prediction_index = int(round(float(tumor_normal_prediction_value)))
tumor_normal_prediction_label = params["tumor_normal"][tumor_normal_prediction_index]
# NOTE(review): assumes the score is a probability-like value in [0, 1] for
# class 1 - confirm against the training notebook. Under that assumption the
# confidence in class 0 is 1 - score, not the raw score.
if tumor_normal_prediction_index == 1:
    tumor_normal_prediction_confidence = tumor_normal_prediction_value
else:
    tumor_normal_prediction_confidence = 1 - tumor_normal_prediction_value
display(HTML("<h3>{} with {:.2f} confidence</h3>".format(
    tumor_normal_prediction_label, tumor_normal_prediction_confidence)))

# --- Primary site -----------------------------------------------------------
# The outputs after index 0 are scored against params["primary_site"]. Size the
# slice from the label list so that no class (in particular the last one) is
# excluded from the argmax.
primary_site_scores = prediction[1:1 + len(params["primary_site"])]
primary_site_prediction_index = int(np.argmax(primary_site_scores))
primary_site_prediction_value = primary_site_scores[primary_site_prediction_index]
primary_site_prediction_label = params["primary_site"][primary_site_prediction_index]
display(HTML("<h3>{} with {:.2f} confidence</h3>".format(
    primary_site_prediction_label, primary_site_prediction_value)))