# Build Network
With NetSim, I have a tool that is capable of simulating a network with implanted subnetworks.
Now, I want to use its already generated networks to build the training data input for a GCN.

This notebook will take a network and some RNA-seq counts to form a data set that can serve as input to a GCN model.
What it does:
1. Loading network, RNA-seq counts & differential expression
2. Using the insert positions from the network to assign the insert nodes with the most differentially expressed feature vectors
3. Write an HDF5 file (using h5py) that can serve as input for the GCN

In [97]:
import pandas as pd
import networkx as nx
import numpy as np
import h5py

## Loading the Data

In [98]:
# Read the simulated network and the node positions at which subnetworks were implanted.
network = nx.read_edgelist('../simulation/network.edgelist')
insert_positions = []
with open('../simulation/implant_positions.txt', 'r') as f:
    for line in f:
        # Only lines of the form 'Subnetwork...: <tab-separated ints>' carry positions;
        # '#' comment lines and everything else are skipped.
        if line.startswith('Subnetwork'):
            positions = line.split(':')[1].strip().split('\t')
            insert_positions.append([int(i) for i in positions])

# Read differential expression (log2FoldChange is used as ranking further down).
# `pd.DataFrame.from_csv` has been removed from pandas; `read_csv` with
# `index_col=0, parse_dates=True` reproduces its defaults.
de = pd.read_csv('../data/differential_expression/deseq2_gfppT16_vs_ControlT16.csv',
                 index_col=0, parse_dates=True)
# Drop every row that has a missing value in any column.
de = de.dropna(axis=0)

# Read features (RNA-seq counts) & train/test masks; the validation split is optional.
data_file = '../data/preprocessing/legionella_gcn_input.h5'
with h5py.File(data_file, 'r') as f:
    features = f['features'][:]
    node_names = f['gene_names'][:]
    y_train = f['y_train'][:]
    y_test = f['y_test'][:]
    y_val = f['y_val'][:] if 'y_val' in f else None
    train_mask = f['mask_train'][:]
    test_mask = f['mask_test'][:]
    val_mask = f['mask_val'][:] if 'mask_val' in f else None

# One row per entry of `gene_names`, indexed by the first column of that dataset.
features_df = pd.DataFrame(features, index=node_names[:, 0])
# Build label column for features: a row is labelled if the first column of
# either the training or the test labels is set for it.
features_df['label'] = y_train[:, 0] | y_test[:, 0]
# The assignment below hands exactly one labelled gene to each subnetwork.
assert features_df.label.sum() == len(insert_positions), \
    "number of labelled genes must equal the number of implanted subnetworks"

## Assign Real Features to Simulated Nodes

In [131]:
# The first node of every implanted subnetwork receives one of the labelled (disease) genes.
features_df['node'] = np.nan
first_nodes = [subnet_nodes[0] for subnet_nodes in insert_positions]
is_disease = features_df.label == 1
features_df.loc[is_disease, 'node'] = first_nodes

# Candidate genes for the remaining subnetwork nodes: everything in the DE table
# that has a feature row and is not itself a disease gene, ranked by descending
# log2FoldChange.
labels = features_df[is_disease].index
has_features = de.index.isin(features_df.index)
is_labelled = de.index.isin(labels)
diff_expr = de[has_features & ~is_labelled]
sorted_de = diff_expr.sort_values(by='log2FoldChange', ascending=False)

# Nodes that already carry a gene (so far only the first node of each subnetwork).
assigned_nodes = list(features_df[features_df.node.notnull()].node.values)
assert (len(set(assigned_nodes)) == len(assigned_nodes)) # no duplicates here

# Walk the subnetworks position by position (all second nodes, then all third
# nodes, ...) and give the k-th node visited the k-th ranked gene.
subnet_size = len(insert_positions[0])
neighbour_nodes = [subnet_nodes[node_position]
                   for node_position in range(1, subnet_size)
                   for subnet_nodes in insert_positions]
for rank, target_node in enumerate(neighbour_nodes):
    ens_id = sorted_de.index[[rank]]
    features_df.loc[ens_id, 'node'] = target_node
    assigned_nodes.append(target_node)
# Number of ranked genes consumed.
idx = len(neighbour_nodes)

In [127]:
# Now, randomly assign features to the other nodes.
# A seeded generator drives both random draws so that re-running the notebook
# reproduces the same assignment.
rng = np.random.RandomState(42)

# Feature rows that have not been given a node yet.
features_not_assigned = features_df[features_df.node.isnull()]
# Node ids belonging to implanted subnetworks; these are excluded below.
already_assigned = np.array(insert_positions).flatten()

# Shuffle all network nodes, then keep only those outside the subnetworks.
node_assignments = rng.choice(list(network.nodes()),
                              size=nx.number_of_nodes(network),
                              replace=False)
node_assignments = [int(i) for i in node_assignments if int(i) not in already_assigned]

# Draw as many still-unassigned feature rows as there are free nodes and pair them up.
to_assign = features_not_assigned.sample(n=len(node_assignments),
                                         replace=False,
                                         random_state=rng).index
features_df.loc[features_df.index.isin(to_assign), 'node'] = node_assignments
assigned_features = features_df[~features_df.node.isnull()]

# Finally, drop every row with a missing value (this removes the features that
# did not get a node), index by node and discard the helper label column.
final_features = (
    features_df
    .dropna(axis=0)
    .set_index('node')
    .drop('label', axis=1)
)
final_features.shape

(1850, 26)

## Write To Container

In [150]:
# Adjacency matrix of the simulated network as a DataFrame.
# `to_pandas_dataframe` was renamed to `to_pandas_adjacency` in newer networkx
# releases; use whichever the installed version provides.
if hasattr(nx, 'to_pandas_adjacency'):
    network_np = nx.to_pandas_adjacency(network)
else:
    network_np = nx.to_pandas_dataframe(network)
node_names = final_features.index
# First node of every implanted subnetwork.
real_disease_genes = np.array([i[0] for i in insert_positions])

# NOTE(review): `final_features` keeps the row order of `features_df` (minus dropped
# rows) and is indexed by node id, the adjacency follows the node order of `network`,
# and the y_*/mask_* arrays are written exactly as read from the input file.
# Confirm that the consumer aligns these by 'gene_names', or reorder/subset them
# here before writing.

# The context manager closes the file even if one of the writes fails.
with h5py.File('../data/simulation/simulated_input_legionella.h5', 'w') as f:
    f.create_dataset('network', data=network_np, shape=network_np.shape)
    f.create_dataset('features', data=final_features, shape=final_features.shape)
    f.create_dataset('gene_names', data=node_names, shape=node_names.shape)
    f.create_dataset('real_disease_genes', data=real_disease_genes, shape=real_disease_genes.shape)

    f.create_dataset('y_train', data=y_train, shape=y_train.shape)
    f.create_dataset('y_test', data=y_test, shape=y_test.shape)
    f.create_dataset('mask_train', data=train_mask, shape=train_mask.shape)
    f.create_dataset('mask_test', data=test_mask, shape=test_mask.shape)