# Join Adjacency Matrix, Gene Expression & Labels

In [1]:
import pandas
import numpy as np
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True)
import h5py

## Get Gene Expression

In [2]:
# params
TIMES_STD_THRESHOLD = 10.
MAX_ZEROS_ALLOWED = .5

In [3]:
# load data
gene_expression = pandas.DataFrame.from_csv('gene_expression/normalized-counts-labels.csv',
                                            encoding='utf-8', sep=',')
ge_nonames = gene_expression.drop('Name', axis=1)

# kick out super highly expressed genes
threshold = ge_nonames.mean(axis=1).std()*TIMES_STD_THRESHOLD
#print ("Threshold Gene Expression: {}".format(threshold))
anomalies = gene_expression[ge_nonames.mean(axis=1) > threshold]
ge_anomalies_removed = ge_nonames.drop(anomalies.index)

# kick out genes with too many zeros
ge_zeros_removed = ge_anomalies_removed[ge_anomalies_removed.astype('bool').mean(axis=1)>=(1-MAX_ZEROS_ALLOWED)]

# scaling
scaler = preprocessing.StandardScaler()
scaled_features = scaler.fit_transform(ge_zeros_removed)
ge_scaled = pandas.DataFrame(scaled_features,
                             index=ge_zeros_removed.index,
                             columns=ge_zeros_removed.columns)

# print some information
print ("Had gene expression for {} genes in the beginning.".format(ge_nonames.shape[0]))
print ("kicked out {} super highly expressed genes".format(ge_nonames.shape[0] - ge_anomalies_removed.shape[0]))
print ("Kicked out {} genes with more than {}% zeros".format(ge_anomalies_removed.shape[0]-ge_zeros_removed.shape[0],
                                                             MAX_ZEROS_ALLOWED*100.
                                                            ))
print ("==> Left with gene expression for {} genes".format(ge_scaled.shape[0]))

Had gene expression for 41424 genes in the beginning.
kicked out 9 super highly expressed genes
Kicked out 15021 genes with more than 50.0% zeros
==> Left with gene expression for 26394 genes


## Get PPI Network according to Gene Expression Data

In [4]:
gene_names = gene_expression.loc[ge_scaled.index].Name.values
gene_ids = ge_scaled.index
print (gene_names.shape, gene_ids.shape)

(26394,) (26394,)


In [5]:
# read interaction data
interactions = pandas.DataFrame.from_csv('ppi_network/ConsensusPathDB_human_PPI.csv',
                                         header=1,
                                         sep='\t',
                                         encoding='utf8'
                                        )
interactions_nona = interactions.dropna()

# construct Adjacency Matrix
N = len(gene_names)
print (N)
adjacency_matrix = np.zeros((N, N), np.uint8)
adj_df = pandas.DataFrame(adjacency_matrix, index=gene_names, columns=gene_names)

count = 0
for index, row in interactions_nona.iterrows():
    if row.interaction_confidence > 0.5:
        interaction_partners = row.interaction_participants.split(',')
        if len(interaction_partners) == 2:
            i1 = interaction_partners[0].split('_')[0].strip() # get rid of "_HUMAN" at end of prot name
            i2 = interaction_partners[1].split('_')[0].strip()
            if i1 in gene_names and i2 in gene_names:
                adj_df.ix[i1, i2] = 1
                adj_df.ix[i2, i1] = 1
            
    count += 1
    if count % 10000 == 0:
        print ("Processed {} out of {} rows".format(count, interactions_nona.shape[0]))

26394
Processed 10000 out of 272744 rows
Processed 20000 out of 272744 rows
Processed 30000 out of 272744 rows
Processed 40000 out of 272744 rows
Processed 50000 out of 272744 rows
Processed 60000 out of 272744 rows
Processed 70000 out of 272744 rows
Processed 80000 out of 272744 rows
Processed 90000 out of 272744 rows
Processed 100000 out of 272744 rows
Processed 110000 out of 272744 rows
Processed 120000 out of 272744 rows
Processed 130000 out of 272744 rows
Processed 140000 out of 272744 rows
Processed 150000 out of 272744 rows
Processed 160000 out of 272744 rows
Processed 170000 out of 272744 rows
Processed 180000 out of 272744 rows
Processed 190000 out of 272744 rows
Processed 200000 out of 272744 rows
Processed 210000 out of 272744 rows
Processed 220000 out of 272744 rows
Processed 230000 out of 272744 rows
Processed 240000 out of 272744 rows
Processed 250000 out of 272744 rows
Processed 260000 out of 272744 rows
Processed 270000 out of 272744 rows


In [6]:
adj_df.sum().sum()

87244

## Get Labels

## Generate Training, Testing & Validation Splits

## Write to hdf5 File on Disk

In [117]:
f = h5py.File('../data/preprocessing/ppi_networks.h5', 'w')
f.create_dataset('consensusPathDB_ppi', data=adj_df, shape=adj_df.shape)
f.create_dataset('gene_expression', data=ge_scaled, shape=ge_scaled.shape)
f.create_dataset('gene_names', data=gene_names_both, dtype=string_dt)

# doesn't exist yet :-(
#f.create_dataset('y_train', data=y_train, shape=y_train.shape)
#f.create_dataset('y_test', data=y_test, shape=y_test.shape)
#f.create_dataset('y_val', data=y_val, shape=y_val.shape)

#f.create_dataset('mask_train', data=mask_train, shape=mask_train.shape)
#f.create_dataset('mask_test', data=mask_test, shape=mask_test.shape)
#f.create_dataset('mask_val', data=mask_val, shape=mask_val.shape)
f.close()

ImportError: HDFStore requires PyTables, "No module named 'tables'" problem importing

In [118]:
fname = 'ppi_networks.h5'
with h5py.File(fname, 'r') as f:
    gene_expression_data = f['gene_expression'][:]
    ppi_network = f['consensusPathDB_ppi'][:]
    gene_names = f['gene_names'][:]

In [120]:
type(ppi_network)

numpy.ndarray

In [121]:
pr_val = np.array([3, 1, 2])
gene_names = np.array(['g1', 'g2', 'g3'])
sort_idx = pr_val.argsort()
gene_names_sorted = gene_names[sort_idx[::-1]]

In [122]:
gene_names_sorted

array(['g1', 'g3', 'g2'],
      dtype='<U2')

In [124]:
import networkx as nx

In [125]:
G = nx.DiGraph(nx.path_graph(4))
pr = nx.pagerank(G)

In [126]:
pr

{0: 0.17543839772251535,
 1: 0.32456160227748465,
 2: 0.32456160227748465,
 3: 0.17543839772251535}

In [128]:
import operator
pr_s = sorted(pr.items(), key=operator.itemgetter(1))[::-1]

In [131]:
gene_names_both

array([['TSPAN6', 'ENSG00000000003'],
       ['DPM1', 'ENSG00000000419'],
       ['SCYL3', 'ENSG00000000457'],
       ..., 
       ['CH17-132F21.5', 'ENSG00000281904'],
       ['RP11-439M15.1', 'ENSG00000281909'],
       ['LINC01144', 'ENSG00000281912']], dtype=object)

In [135]:
count = 1
for gene_idx, pr in pr_s:
    print ("{}\t{}\t{}\t{}\n".format(gene_names_both[gene_idx][1], gene_names_both[gene_idx][0], count, pr))
    count += 1

ENSG00000000457	SCYL3	1	0.32456160227748465

ENSG00000000419	DPM1	2	0.32456160227748465

ENSG00000000460	C1orf112	3	0.17543839772251535

ENSG00000000003	TSPAN6	4	0.17543839772251535



In [12]:
import tables

ModuleNotFoundError: No module named 'tables'