# Feature Importance

Determine which features contribute the most to the output and see how well this correlates with the COSMIC cancer gene census:

http://cancer.sanger.ac.uk/census/

Resources

https://en.wikipedia.org/wiki/Feature_selection#Wrapper_method

https://stats.stackexchange.com/questions/250381/feature-selection-using-deep-learning

http://blog.datadive.net/selecting-good-features-part-i-univariate-selection/

https://arxiv.org/abs/1704.02685


In [7]:
import numpy as np
import pandas as pd

In [1]:
# Restore the network that was trained on the GPU box: the architecture comes
# from a JSON file and the learned parameters from a separate weights file.
from keras.models import model_from_json

print("Loading model...")
with open("models/model.json", "r") as architecture_file:
    architecture_json = architecture_file.read()
model = model_from_json(architecture_json)

print("Loading weights...")
model.load_weights("models/weights.h5")

# Compile so the evaluation cell can report loss and accuracy.
print("Compliling model...")
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

Using TensorFlow backend.


Loading model...
Loading weights...
Compliling model...


In [4]:
# Sanity-check the restored model: evaluate it on the test split stored in the
# HDF5 file and print each metric name next to its value.
from keras.utils.io_utils import HDF5Matrix
import h5py

input_file = "data/tumor_normal.h5"
X_test = HDF5Matrix(input_file, "X_test")
y_test = HDF5Matrix(input_file, "y_test")

test_metrics = model.evaluate(X_test, y_test)
print(model.metrics_names, test_metrics)

['loss', 'acc'] [0.5051567359446264, 0.92263460533193942]


In [5]:
# Read the full "genes" dataset (the feature names) from the HDF5 file into memory.
# The ranking cell below uses it as the index of the weight table.
import h5py
with h5py.File("data/tumor_normal.h5", "r") as h5_file:
    genes = h5_file["genes"][:]

In [8]:
"""
Simplest possible importance score: for each gene, add up its weights at the
first layer, take the magnitude of that total, and sort descending to see
whether the highest-scoring genes intersect with the COSMIC list.

Caveat: this is |sum(w)|, not sum(|w|), so weights of opposite sign cancel each
other out. The sum of absolute values may be the better score, since a negative
or a positive weight both imply an effect.
"""
weights = model.layers[2].get_weights()

# weights[0] is indexed by gene along axis 0 (its length must match `genes`
# for the DataFrame below); collapse axis 1 to get one signed total per gene.
gene_weight_totals = weights[0].sum(axis=1)
ranks = np.absolute(gene_weight_totals)

# astype('U') turns the stored gene names into unicode strings for the index.
rankings = pd.DataFrame(ranks, index=genes.astype('U'))
rankings = rankings.sort_values(by=0, ascending=False)
rankings.head()

Unnamed: 0,0
CTC-305H11,23.287769
RP11-78A18,20.994854
U3,19.101227
RP11-297L1,18.811268
LINC00363,18.586199


In [9]:
# Gene names in ranking order (rankings was sorted by descending score when it was built)
rankings.index.values

array(['CTC-305H11', 'RP11-78A18', 'U3', ..., 'MIR6888', 'C12orf79',
       'MIR4745'], dtype=object)

In [10]:
# Load the COSMIC cancer gene census table and keep only the gene symbols.
cosmic_table = pd.read_table("cancer_genes.tsv")
cosmic = cosmic_table["Gene Symbol"].values

In [15]:
# Count how many of the top-ranked genes also appear in the COSMIC census.
top_n = 10000
top_ranked_genes = rankings.index.values[:top_n]
np.intersect1d(top_ranked_genes, cosmic).shape

(61,)

In [12]:
# Try DeepLIFT
# NOTE(review): this installs straight from the GitHub repository with no tag or
# commit pinned, so a re-run may pick up a different version than the one
# recorded in this cell's output (deeplift-0.5.1-theano).
!pip install git+https://github.com/kundajelab/deeplift.git

Collecting git+https://github.com/kundajelab/deeplift.git
  Cloning https://github.com/kundajelab/deeplift.git to /tmp/pip-bzq8yc3w-build
Installing collected packages: deeplift
  Running setup.py install for deeplift ... [?25ldone
[?25hSuccessfully installed deeplift-0.5.1-theano


In [16]:
# Convert the Keras model into a DeepLIFT model, build a contribution-scoring
# function from it, and run that function over the input samples in batches.
# NOTE(review): the recorded output of this cell is `KeyError: 'inputlayer'`,
# so as written it stops before any scores are produced. Presumably the
# sequential converter has no handler for the model's input layer (the next
# cell shows model.layers[0] is named 'input_1') — confirm against the full
# traceback and the installed DeepLIFT version.
import deeplift
from deeplift.conversion import keras_conversion as kc

deeplift_model = kc.convert_sequential_model(
                    model,
                    nonlinear_mxts_mode=deeplift.blobs.NonlinearMxtsMode.DeepLIFT_GenomicsDefault)

# Layer index at which scores are computed; presumably 0 means the input layer,
# i.e. one score per input feature — confirm against the DeepLIFT docs.
find_scores_layer_idx = 0

# target_layer_idx=-1 presumably selects the last layer as the scoring target — TODO confirm.
deeplift_contribs_func = deeplift_model.get_target_contribs_func(
                            find_scores_layer_idx=find_scores_layer_idx,
                            target_layer_idx=-1)

# NOTE(review): X_train is not defined in any earlier cell of this notebook
# (only X_test and y_test are loaded), so on a fresh kernel this call would
# raise NameError even if the conversion above succeeded.
scores = np.array(deeplift_contribs_func(task_idx=0,
                                         input_data_list=[X_train],
                                         batch_size=10,
                                         progress_update=1000))

nonlinear_mxts_mode is set to: DeepLIFT_GenomicsDefault


KeyError: 'inputlayer'

In [20]:
# Show the name of the model's first layer — presumably to diagnose the
# `KeyError: 'inputlayer'` recorded for the DeepLIFT conversion cell.
model.layers[0].name

'input_1'