In [2]:
from stellar.layer.hinsage import Hinsage
import keras
from keras import backend as K
import numpy as np
import pandas as pd
from typing import List

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Read edge data
edge_data = pd.read_csv(
    '/home/ubuntu/genes/max_cc_int_num_indices.txt', 
    delim_whitespace=True
).drop(['score', 'data_source'], axis=1)

# Read attribute data
gene_attr = pd.read_csv(
    '/home/ubuntu/genes/maxcc_node_attr_cleaned_num_indices.txt', 
    index_col=1
).drop(['Unnamed: 0'], axis=1)

# List of IDs
rnd = np.random.uniform(0, 1, len(gene_attr))
ids_train = gene_attr.loc[rnd <= 0.5].index.values
ids_val = gene_attr.loc[(rnd > 0.5) & (rnd < 0.7)].index.values
ids_test = gene_attr.loc[rnd >= 0.7].index.values

# Features
feats = gene_attr.drop(['node_type'], axis=1)
def get_feats(indices: List[int]):
    return feats.loc[indices].fillna(0).as_matrix()

# Labels
labels = gene_attr['node_type'].map(lambda x: x == 'alz')
def get_labels(indices: List[int]):
    return np.array(labels[indices], dtype=np.float64)

# Adjacency lists of each edge type
adj_coex = edge_data.loc[
    edge_data['interaction_type']=='coexpression'
].groupby(['ensg.A'])['ensg.B'].apply(list)
adj_ppi = edge_data.loc[
    edge_data['interaction_type']=='PPI'
].groupby(['ensg.A'])['ensg.B'].apply(list)
adj_epis = edge_data.loc[
    edge_data['interaction_type']=='epistasis'
].groupby(['ensg.A'])['ensg.B'].apply(list)
def sample_neighs(indices: List[int], ns: int):
    def with_adj(adj_curr):
        return [[-1]*ns if (not isinstance(adj, list)) and pd.isnull(adj) 
                else [adj[i] for i in np.random.randint(len(adj), size=ns)]
                for adj in adj_ppi.loc[indices].values]
    return (with_adj(adj_coex), with_adj(adj_ppi), with_adj(adj_epis))

# Get batch, given root node IDs and number of samples per "hop"
def get_batch(indices: List[int], ns: List[int]):
    nb = len(indices)
    flatten = lambda l: [item for sublist in l for item in sublist]
    coex, ppi, epis = sample_neighs(indices, ns[0])
    coex_1 = sample_neighs(flatten(coex), ns[1])
    ppi_1 = sample_neighs(flatten(ppi), ns[1])
    epis_1 = sample_neighs(flatten(ppi), ns[1])
    return (get_labels(indices),
            [get_feats(flatten(inds)).reshape([nb,-1,414]) 
             for inds in [[indices], coex, ppi, epis, *coex_1, *ppi_1, *epis_1]])

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  app.launch_new_instance()


In [21]:
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, ids, nf, ns, batch_size=1000, name='train'):
        'Initialization'
        self.batch_size = batch_size
        self.ids = ids
        self.data_size = len(ids)
        self.nf = nf
        self.ns = ns
        self.idx = 0
        self.name = name
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.ceil(self.data_size/self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        if (self.idx >= self.data_size):
            print("this shouldn't happen, but it does")
            self.on_epoch_end()
        end = min(self.idx + self.batch_size, self.data_size)
#         print("Fetching {} batch {}:{}".format(self.name, str(self.idx), str(end)))
        indices = list(self.ids[range(self.idx, end)])
        tgt, inp = get_batch(indices, self.ns)
        self.idx = end

        return inp, tgt

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.idx = 0
#         print('end of {} epoch!'.format(self.name))
        np.random.shuffle(self.ids)

In [56]:
nf = 414
n_samples = [5, 5]

def n_at(i):
    return np.product(n_samples[:i])

def create_weighted_binary_crossentropy(zero_weight, one_weight):
    def weighted_binary_crossentropy(y_true, y_pred):
        b_ce = K.binary_crossentropy(y_true, y_pred)
        weight_vector = y_true * one_weight + (1. - y_true) * zero_weight
        weighted_b_ce = weight_vector * b_ce
        return K.mean(weighted_b_ce)

    return weighted_binary_crossentropy

hs = Hinsage(
    output_dims=[256, 256],
    n_samples=n_samples,
    input_neigh_tree=[('gene', [1, 2, 3]), 
                      ('gene', [4, 5, 6]), ('gene', [7, 8, 9]), ('gene', [10, 11, 12]), 
                      ('gene', []), ('gene', []), ('gene', []), 
                      ('gene', []), ('gene', []), ('gene', []),
                      ('gene', []), ('gene', []), ('gene', [])],
    input_dim={'gene': nf}
)

x_inp = [
    keras.Input(shape=(1, nf)),
    keras.Input(shape=(n_at(1), nf)),
    keras.Input(shape=(n_at(1), nf)),
    keras.Input(shape=(n_at(1), nf)),
    keras.Input(shape=(n_at(2), nf)),
    keras.Input(shape=(n_at(2), nf)),
    keras.Input(shape=(n_at(2), nf)),
    keras.Input(shape=(n_at(2), nf)),
    keras.Input(shape=(n_at(2), nf)),
    keras.Input(shape=(n_at(2), nf)),
    keras.Input(shape=(n_at(2), nf)),
    keras.Input(shape=(n_at(2), nf)),
    keras.Input(shape=(n_at(2), nf))
]

x_out = keras.layers.Reshape((256,))(hs(x_inp))
pred = keras.layers.Activation('sigmoid')(keras.layers.Dense(1)(x_out))

model = keras.Model(inputs=x_inp, outputs=pred)
model.compile(
    optimizer=keras.optimizers.Adam(lr=0.01), 
    loss=create_weighted_binary_crossentropy(0.6, 9), 
    metrics=['accuracy']
)

In [57]:
batch_iter = DataGenerator(ids_train, nf, n_samples)
# batch_iter_val = DataGenerator(ids_val, nf, n_samples, name='val')
# batch_iter_test = DataGenerator(ids_test, nf, n_samples, name='test')
model.fit_generator(batch_iter, epochs=10, verbose=2)
# model.evaluate_generator(batch_iter_test)

Epoch 1/10


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


 - 31s - loss: 0.6818 - acc: 0.8126
Epoch 2/10
 - 31s - loss: 0.6718 - acc: 0.7988
Epoch 3/10
 - 31s - loss: 0.6675 - acc: 0.8420
Epoch 4/10
 - 30s - loss: 0.6621 - acc: 0.7535
Epoch 5/10
 - 31s - loss: 0.6601 - acc: 0.8208
Epoch 6/10
 - 31s - loss: 0.6580 - acc: 0.7367
Epoch 7/10
 - 31s - loss: 0.6564 - acc: 0.7871
Epoch 8/10
 - 31s - loss: 0.6531 - acc: 0.7893
Epoch 9/10
 - 30s - loss: 0.6509 - acc: 0.7709
Epoch 10/10
 - 31s - loss: 0.6516 - acc: 0.7954


<keras.callbacks.History at 0x7fd13c0a3240>

In [58]:
class TestDataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, ids, nf, ns, batch_size=1000):
        'Initialization'
        self.batch_size = batch_size
        self.ids = ids
        self.data_size = len(ids)
        self.nf = nf
        self.ns = ns
        self.idx = 0
        self.y_true = []
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.ceil(self.data_size/self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        end = min(self.idx + self.batch_size, self.data_size)
        indices = list(self.ids[range(self.idx, end)])
        tgt, inp = get_batch(indices, self.ns)
        self.y_true += [tgt]
        self.idx = end

        return inp

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.idx = 0

In [59]:
test_iter = TestDataGenerator(ids_test, nf, n_samples)
y_preds = model.predict_generator(test_iter)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [60]:
y_trues_bin = np.concatenate(test_iter.y_true).ravel()[:len(ids_test)]
y_preds_bin = np.array(np.reshape(y_preds, (-1,)) >= 0.5, dtype=np.float64)

In [63]:
met = lambda f,k: sum([f(y_trues_bin[i], y_preds_bin[i]) and y_preds_bin[i] == k for i in range(len(ids_test))])
confu_mat = {'tn': met(lambda t, p: t == p, 0), 'fp': met(lambda t, p: t != p, 1),
             'fn': met(lambda t, p: t != p, 0), 'tp': met(lambda t, p: t == p, 1)}

In [64]:
confu_mat

{'fn': 280, 'fp': 2404, 'tn': 6591, 'tp': 194}