# Build Network
With NetSim, I have a tool that is capable of simulating a network with implanted subnetworks.
Now, I want to use its already generated networks to build the training data input for a GCN.

This notebook will take a network and some RNA-seq counts to form a data set that can serve as input to a GCN model.
What it does:
1. Loading network, RNA-seq counts & differential expression
2. Using the insert positions from the network to assign the insert nodes with the most differentially expressed feature vectors
3. Write an HDF5 file (via h5py) that can serve as input for the GCN

In [12]:
import pandas as pd
import networkx as nx
import numpy as np
import h5py

In [13]:
# params
# BALANCE: if True, as many negative as positive nodes are drawn for training;
# otherwise 70% of the negative nodes go into the training set.
BALANCE = False
# SEED: the notebook draws random node assignments (np.random.choice) and random
# train/test splits (DataFrame.sample without random_state, which falls back to
# numpy's global RNG). Seeding that RNG once makes "Restart & Run All" reproducible.
SEED = 42
np.random.seed(SEED)

## Loading the Data

In [14]:
# read the network and insert positions
network = nx.read_edgelist('../data/simulation/network.edgelist')
insert_positions = []
with open('../data/simulation/implant_positions.txt', 'r') as f:
    for line in f:
        # Only lines of the form 'Subnetwork...: n1<TAB>n2<TAB>...' carry positions;
        # '#' comment lines (and anything else) are skipped.
        if line.startswith('Subnetwork'):
            positions = line.split(':')[1].strip().split('\t')
            insert_positions.append([int(i) for i in positions])

# read differential expression (to get log2FoldChange as ranking)
# pd.DataFrame.from_csv was deprecated and later removed from pandas;
# read_csv with index_col=0 is the documented replacement.
de = pd.read_csv('../data/differential_expression/deseq2_gfppT16_vs_ControlT16.csv',
                 index_col=0)
de = de.dropna(axis=0)

# read features (RNA-seq counts) & train/test masks
data_file = '../data/preprocessing/legionella_gcn_input.h5'
with h5py.File(data_file, 'r') as f:
    features = f['features'][:]
    node_names = f['gene_names'][:]
    y_train = f['y_train'][:]
    y_test = f['y_test'][:]
    # validation labels/mask are optional in the container
    y_val = f['y_val'][:] if 'y_val' in f else None
    train_mask = f['mask_train'][:]
    test_mask = f['mask_test'][:]
    val_mask = f['mask_val'][:] if 'mask_val' in f else None

# one row per gene, indexed by the first column of gene_names
features_df = pd.DataFrame(features, index=node_names[:, 0])
# build label column for features: a gene is labelled if it is positive in train or test
features_df['label'] = y_train[:, 0] | y_test[:, 0]
# every implanted subnetwork needs exactly one labelled gene as its first node
assert features_df.label.sum() == len(insert_positions), \
    "number of labelled genes ({}) != number of subnetworks ({})".format(
        features_df.label.sum(), len(insert_positions))

## Assign Real Features to Simulated Nodes

In [15]:
# Step 1: every labelled (disease) gene becomes the first node of one subnetwork.
features_df['node'] = np.nan
first_nodes = [subnetwork[0] for subnetwork in insert_positions]
is_disease_gene = features_df.label == 1
features_df.loc[is_disease_gene, 'node'] = first_nodes

# Step 2: rank the remaining genes by log2FoldChange (highest first).
# Disease genes are excluded so that they cannot be placed twice.
labels = features_df[is_disease_gene].index
has_features = de.index.isin(features_df.index)
is_labelled = de.index.isin(labels)
diff_expr = de[has_features & ~is_labelled]
sorted_de = diff_expr.sort_values(by='log2FoldChange', ascending=False)

# Step 3: walk through the subnetworks position by position (second node of every
# subnetwork first, then the third, ...) and hand out genes in ranking order.
assigned_nodes = list(features_df[features_df.node.notnull()].node.values)
assert (len(assigned_nodes) == len(set(assigned_nodes))) # no duplicates here
idx = 0
for node_position in range(1, len(insert_positions[0])):
    for subnetwork in insert_positions:
        target_node = subnetwork[node_position]
        ens_id = sorted_de.index[[idx]]  # one-element Index with the idx-th ranked gene
        features_df.loc[ens_id, 'node'] = target_node
        assigned_nodes.append(target_node)
        idx += 1
assert (len(assigned_nodes) == len(set(assigned_nodes)))

In [16]:
# Distribute feature vectors at random over all nodes that do not belong to an
# implanted subnetwork.
features_not_assigned = features_df[features_df.node.isnull()]
already_assigned = np.array(insert_positions).flatten()

# random permutation of all network nodes, minus the subnetwork positions
shuffled_nodes = np.random.choice(list(network.nodes()),
                                  size=nx.number_of_nodes(network),
                                  replace=False)
node_assignments = []
for node in shuffled_nodes:
    node_id = int(node)
    if node_id in already_assigned:
        continue
    node_assignments.append(node_id)

# pick as many still-unassigned genes as there are free nodes and pair them up
to_assign = features_not_assigned.sample(n=len(node_assignments), replace=False).index
features_df.loc[features_df.index.isin(to_assign), 'node'] = node_assignments
assigned_features = features_df[features_df.node.notnull()]

# finally, drop every row that still contains a NaN and index the rest by node
final_features = features_df.dropna(axis=0).set_index('node')



## Compute Labels
As a final step, I have to compute training and test splits for the nodes. Then, I have to construct labels for training and testing set and masks for both.

In [17]:
# One-hot encode the label column, then draw the training nodes per class.
labels = pd.get_dummies(final_features.label)
labels.columns = ['Non_Label', 'Label']
positives = labels[labels.Label == 1]
negatives = labels[labels.Non_Label == 1]

# 70% of the positives are used for training
pos_train = positives.sample(frac=0.7, replace=False)
# negatives: either as many as positives (balanced) or 70% of them
neg_train = (negatives.sample(pos_train.shape[0], replace=False) if BALANCE
             else negatives.sample(frac=0.7, replace=False))
train = pd.concat([pos_train, neg_train])

# the sampled nodes must carry the class they were sampled for
assert (pos_train.index.isin(final_features[final_features.label == 1].index).all())
assert (neg_train.index.isin(final_features[final_features.label == 0].index).all())

# every node that is not a training node is a test node
train_mask = final_features.index.isin(train.index)
test_mask = ~train_mask
print (train_mask.sum(axis=0), test_mask.sum(axis=0))

737 316


In [18]:
# Build the two-column label matrices: column 0 flags a positive node of the
# respective split, column 1 is its complement.
#
# The matrices are assembled directly with numpy instead of pd.get_dummies:
# get_dummies yields a single column when a split contains no positive node,
# and its dtype differs between pandas versions. Both matrices are boolean here.
is_positive = (final_features.label == 1).values
in_train = final_features.index.isin(train.index)

pos_train_flags = is_positive & in_train
y_train = np.column_stack([pos_train_flags, ~pos_train_flags])

# everything that is not a positive training node (this includes negative training nodes)
test_nodes = labels[~labels.index.isin(pos_train.index)].index
# NOTE: the comparison is parenthesised explicitly; `a & b == 1` parses as `(a & b) == 1`
pos_test_nodes = final_features[final_features.index.isin(test_nodes) & is_positive]
pos_test_flags = final_features.index.isin(pos_test_nodes.index)
y_test = np.column_stack([pos_test_flags, ~pos_test_flags])

# positives flagged inside a mask must belong to the matching node set
assert (final_features[np.logical_and(train_mask == 1, y_train[:, 0] == 1)].index.isin(train.index).all())
assert (final_features[np.logical_and(test_mask == 1, y_test[:, 0] == 1)].index.isin(test_nodes).all())
# no node may be a positive in both splits
assert not np.logical_and(y_train[:, 0], y_test[:, 0]).any()

In [19]:
# Sanity check: labels, masks and features should all have the same number of rows.
shape_groups = (
    (y_train.shape, y_test.shape),
    (train_mask.shape, test_mask.shape),
    (final_features.shape,),
)
for shapes in shape_groups:
    print (*shapes)

(1053, 2) (1053, 2)
(1053,) (1053,)
(1053, 25)


## Write To Container

In [20]:
# Keep only the first label column, as an (n, 1) column vector.
y_train = y_train[:, [0]]
y_test = y_test[:, [0]]

In [21]:
# The adjacency matrix must use the same node order as the feature rows, otherwise
# row i of 'network' and row i of 'features' describe different nodes. The graph's
# node labels come from the edge list while final_features is indexed by numeric
# node ids, so map the ids back to the graph's own labels first.
graph_label_by_id = {int(n): n for n in network.nodes()}
nodelist = [graph_label_by_id[int(i)] for i in final_features.index]
# nx.to_pandas_dataframe was renamed to to_pandas_adjacency in networkx 2.x;
# fall back to the old name on older installations.
to_adjacency = getattr(nx, 'to_pandas_adjacency', None) or nx.to_pandas_dataframe
network_np = to_adjacency(network, nodelist=nodelist)

node_names = final_features.index.values
node_names = np.vstack([node_names, final_features.index.values]).transpose(1, 0) # just stack node numbers
real_disease_genes = np.array([i[0] for i in insert_positions])
# errors='ignore' keeps this cell re-runnable after 'label' has already been dropped
final_features = final_features.drop('label', axis=1, errors='ignore')

if BALANCE:
    fname = '../data/simulation/simulated_input_legionella_balanced.h5'
else:
    fname = '../data/simulation/simulated_input_legionella_unbalanced.h5'

# the context manager closes the file even if one of the writes fails
with h5py.File(fname, 'w') as f:
    f.create_dataset('network', data=network_np.values)
    f.create_dataset('features', data=final_features.values)
    f.create_dataset('gene_names', data=node_names)
    f.create_dataset('real_disease_genes', data=real_disease_genes)

    f.create_dataset('y_train', data=y_train)
    f.create_dataset('y_test', data=y_test)
    f.create_dataset('mask_train', data=train_mask)
    f.create_dataset('mask_test', data=test_mask)
print ("Container written to {}".format(fname))

Container written to ../data/simulation/simulated_input_legionella_unbalanced.h5


In [25]:
# Scratch check: sum of the first label column, scaled by 10.
np.sum(y_train[:, 0] * 10, axis=0)

270

In [28]:
# Read the container back for verification. `fname` is the path the container was
# written to; a hard-coded path would inspect the wrong file when BALANCE is True.
# NOTE: this rebinds network, features, node_names, labels and masks to the arrays
# stored in the file.
with h5py.File(fname, 'r') as f:
    network = f['network'][:]
    features = f['features'][:]
    node_names = f['gene_names'][:]
    y_train = f['y_train'][:]
    y_test = f['y_test'][:]
    # validation labels/mask are optional in the container
    y_val = f['y_val'][:] if 'y_val' in f else None
    train_mask = f['mask_train'][:]
    test_mask = f['mask_test'][:]
    val_mask = f['mask_val'][:] if 'mask_val' in f else None
print (y_test.shape, y_test.sum(axis=0))
print (y_train.shape, y_train.sum(axis=0))
print (train_mask.shape, train_mask.sum(axis=0))
print (test_mask.shape, test_mask.sum(axis=0))

(1053, 2) [  11 1042]
(1053, 2) [  27 1026]
(1053,) 737
(1053,) 316


In [15]:
# Show the dimensions of the stored node name table.
print (np.shape(node_names))

(1053, 2)
