#  Classify Treehouse

Load models trained in other notebooks and see how they do on the Treehouse samples

In [1]:
import os
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import matplotlib.pyplot as pyplot

# fix random seed for reproducibility
np.random.seed(42)

# See https://github.com/h5py/h5py/issues/712
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load Datasets

In [70]:
%%time
X = pd.read_hdf("data/tcga_target_gtex.h5", "expression")
Y = pd.read_hdf("data/tcga_target_gtex.h5", "labels")

X_treehouse = pd.read_hdf("data/treehouse.h5", "expression")
Y_treehouse = pd.read_hdf("data/treehouse.h5", "labels")

CPU times: user 0 ns, sys: 13.7 s, total: 13.7 s
Wall time: 15.5 s


## Primary Site Classifier

In [31]:
# Load the model
model = keras.models.model_from_json(open("models/primary_site.model.json").read())
model.load_weights("models/primary_site.weights.h5")
params = json.loads(open("models/primary_site.params.json").read())

In [37]:
# Let's run it on the training set just to make sure we haven't lost something...
from sklearn import preprocessing
encoder = preprocessing.LabelBinarizer()
y_onehot = encoder.fit_transform(Y.primary_site.values)

# Prune X to only include genes in the gene sets
X_pruned = X.drop(labels=(set(X.columns) - set(params["genes"])), axis=1, errors="ignore")

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.evaluate(X_pruned, y_onehot)



[0.07240446106504723, 0.9799294959211106]

In [38]:
# Now let's try on Treehouse

# Prune X to only include genes in the gene sets
X_treehouse_pruned = X_treehouse.drop(labels=(set(X.columns) - set(params["genes"])), axis=1, errors="ignore")

In [66]:
Y_treehouse["primary_site_predicted"] = [", ".join(["{}({:0.2f})".format(params["labels"][i], p[i]) 
                                                    for i in p.argsort()[-3:][::-1]]) 
                                         for p in model.predict(X_treehouse_pruned)]
Y_treehouse.primary_site_predicted[0:3]

id
TH01_0051_S01                Kidney(0.17), Lung(0.15), Brain(0.06)
TH01_0053_S01    White blood cell(0.25), Brain(0.16), Kidney(0.15)
TH01_0054_S01       Lung(0.14), Skin(0.14), White blood cell(0.10)
Name: primary_site_predicted, dtype: object

In [67]:
Y_treehouse.to_csv("models/treehouse_predictions.tsv", sep="\t")

## Treehouse Pathways
Load predictions from pathway model, enrich with pathways and disease from tertiary protocol and analyze

In [5]:
Y = pd.read_csv("models/Y_treehouse_predictions.tsv", sep="\t", )
Y.head()

Unnamed: 0,id,age_in_years,gender,disease,predicted_tumor_normal,predicted_primary_site,predicted_disease,predicted_pathways
0,TH01_0051_S01,,Not Reported,Hepatoblastoma,Tumor (0.77),"Kidney (0.17), Liver (0.10), Lung (0.05)","Kidney Clear Cell Carcinoma (0.12), Liver Hepa...","KEGG_ABC_TRANSPORTERS (2.85), KEGG_LEUKOCYTE_T..."
1,TH01_0053_S01,,Not Reported,Acute Myeloid Leukemia,Tumor (0.68),"White blood cell (0.29), Blood (0.22), Lung (0...","Acute Myeloid Leukemia (0.22), Whole Blood (0....","KEGG_CELL_ADHESION_MOLECULES_CAMS (3.85), KEGG..."
2,TH01_0054_S01,,Not Reported,Acute Lymphoblastic Leukemia,Tumor (0.59),"White blood cell (0.29), Blood (0.25), Stomach...","Acute Myeloid Leukemia (0.20), Whole Blood (0....","KEGG_FC_GAMMA_R_MEDIATED_PHAGOCYTOSIS (3.61), ..."
3,TH01_0055_S01,,Not Reported,Glioma,Tumor (0.59),"Brain (0.64), Kidney (0.05), Esophagus (0.05)","Brain Lower Grade Glioma (0.21), Head & Neck S...","KEGG_CALCIUM_SIGNALING_PATHWAY (4.15), KEGG_GL..."
4,TH01_0061_S01,,Not Reported,Germ Cell Tumor,Tumor (0.77),"Kidney (0.14), Lung (0.09), Skin (0.07)","Kidney Clear Cell Carcinoma (0.08), Lung Squam...",KEGG_BIOSYNTHESIS_OF_UNSATURATED_FATTY_ACIDS (...


In [67]:
import glob
import json


id = "TH01_0051_S01"



conf_path = glob.glob(
        "/treehouse/archive/downstream/{}/tertiary/treehouse-protocol*/compendium*/conf.json".format(y.id))


    
#     if conf_path:
#         with open(conf_path[0]) as f:
#             conf = json.loads(f.read())
            
#             if "disease" in conf["info"]:
#                 print(conf["info"]["disease"])



In [63]:
clinical.head()



Unnamed: 0,id,Disease,Dataset,Gender,Anatomical_location,Sample_type,Stage_all_cancers,Grade_all_cancers,Age_at_dx,Ped_AYA,Histology_all_cancers,Subcategory,Race,Ethnicity
0,TH01_0053_S01,acute lymphoblastic leukemia,TH,,not noted,,,,,yes,,,,
1,TH01_0054_S01,lymphoblastic leukemia,TH,,not noted,,,,,yes,b-cell,"T1, relapse",,
2,TH01_0055_S01,glioma,TH,,not noted,,,unknown,,yes,astrocytoma,,,
3,TH01_0061_S01,germ cell tumor,TH,,not noted,,,,,yes,mixed,,,
4,TH01_0062_S01,acute lymphoblastic leukemia,TH,,not noted,,,,,yes,,,,


In [30]:
conf

{'cohort_pathsafe_name': 'v4',
 'dir': {'base': '/data',
  'cohort': '/data/references/compendium/v4',
  'cohort_clinical': '/data/references/compendium/v4/clinical',
  'cohortbase': '/data/references/compendium',
  'gene_expression_plots_dir': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/expression_plots',
  'ref': '/data/references/external',
  'sample': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01',
  'samplebase': '/data/notebooks/e-t-k/protocol_batches/thops74/output'},
 'file': {'5_out': {'genes_pc_up': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/genes_TH01_0051_S01_pc_up',
   'genes_pd_up': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/genes_TH01_0051_S01_pd_up'},
  '7_out': {'all_gene_aggregation': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/allGeneAggregation.txt',
   'druggable_gene_aggregation': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0