## Load Differential Expression from html & write to csv

In [1]:
import pandas as pd
import numpy as np

In [2]:
def load_diff_expr(file_name_up, file_name_down):
    """Loads differential up and down-regulated genes."""
    differential_expression_up = pd.read_html(file_name_up,
                                           index_col=2,
                                           header=0
                                          )[0]
    differential_expression_down = pd.read_html(file_name_down,
                                               index_col=2,
                                               header=0
                                               )[0]
    print ("Loaded Differential Expression from html...")

    # concatenate the up and down-regulated genes
    differential_expression_down.drop('Ensembl', inplace=True)
    differential_expression_down = differential_expression_down.convert_objects(convert_numeric=True)
    de = pd.concat([differential_expression_up, differential_expression_down])
    return de

de = load_diff_expr('../data/differential_expression/DEanalysis/gpp1605-up.html',
                    '../data/differential_expression/DEanalysis/gpp1605-down.html'
                   )

Loaded Differential Expression from html...




In [3]:
de.to_csv('../data/differential_expression/DEanalysis/gpp1605_both.tsv', sep='\t')

## Join differential expression with the network
I have differential expression for genes as well as the network. Now, I want to join the two together and obtain a personalization vector (a score for each of the 

In [4]:
# load gene expression, network and corresponding gene names
import h5py
fname = '../data/preprocessing/ppi_networks.h5'
with h5py.File(fname, 'r') as f:
    gene_expression_data = f['gene_expression'][:]
    ppi_network = f['consensusPathDB_ppi'][:]
    gene_names = f['gene_names'][:]

In [5]:
# add column with node numbers (as in the networkx graph) to the gene names
indices = np.arange(0, gene_names.shape[0]).reshape(gene_names.shape[0], 1)
gene_names_with_index = np.hstack([gene_names, indices])
gene_names_df = pd.DataFrame(gene_names_with_index[:, 1:],
                             index=gene_names_with_index[:, 0],
                             columns=['Gene-name', 'Node-number']
                            )

# join gene names and differential expression
names_with_de = gene_names_df.join(de, lsuffix='_left')
genes_zero_de = names_with_de.log2FoldChange.isnull().sum()
print ("{} genes in network don't have any differential expression values!".format(genes_zero_de))

# calculate random walk probabilities from log2FoldChange
names_with_de.ix[names_with_de.log2FoldChange.isnull(), 'log2FoldChange'] = 0
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

names_with_de['rw_prob'] = softmax(names_with_de.log2FoldChange)
names_with_de.head()

# construct dict which can be fed to the networkx pagerank algorithm
personalization = {row['Node-number']:row.rw_prob for ens, row in names_with_de.iterrows()}

5613 genes in network don't have any differential expression values!


In [8]:
names_with_de.shape, gene_names_df.shape, de.shape

((10367, 11), (10367, 2), (8877, 8))

In [17]:
# add column with node numbers (as in the networkx graph) to the gene names
indices = np.arange(0, gene_names.shape[0]).reshape(gene_names.shape[0], 1)
gene_names_with_index = np.hstack([gene_names, indices])
gene_names_df = pd.DataFrame(gene_names_with_index[:, 1:],
                             index=gene_names_with_index[:, 0],
                             columns=['Gene-name', 'Node-number']
                            )

# join gene names and differential expression
names_with_de = de.join(names_with_de, lsuffix='_left')
genes_zero_de = names_with_de.log2FoldChange.isnull().sum()
print ("{} genes in network don't have any differential expression values!".format(genes_zero_de))

4123 genes in network don't have any differential expression values!


In [22]:
t = names_with_de[names_with_de.log2FoldChange.isnull()]
t[t['Gene-type_left'] == 'protein_coding']

Unnamed: 0_level_0,Gene-type_left,Gene-name_left,baseMean_left,log2FoldChange_left,lfcSE_left,stat_left,pvalue_left,padj_left,Gene-name_left,Node-number,Gene-type,Gene-name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ENSG00000110172,protein_coding,CHORDC1,1652.672338,4.377682,0.291540,15.015742,5.791062e-51,2.045238e-47,,,,,,,,,,
ENSG00000162783,protein_coding,IER5,4743.970707,5.870139,0.404073,14.527422,8.122010e-48,1.673269e-44,,,,,,,,,,
ENSG00000184378,protein_coding,ACTRT3,80.430582,5.983328,0.488226,12.255238,1.574862e-34,1.441991e-31,,,,,,,,,,
ENSG00000163376,protein_coding,KBTBD8,725.224687,4.002675,0.327013,12.240117,1.897600e-34,1.675445e-31,,,,,,,,,,
ENSG00000118946,protein_coding,PCDH17,49.290006,4.741085,0.435609,10.883804,1.376937e-27,6.947068e-25,,,,,,,,,,
ENSG00000275993,protein_coding,CH507-42P11.8,235.495664,4.706890,0.439696,10.704882,9.655373e-27,4.420373e-24,,,,,,,,,,
ENSG00000185222,protein_coding,WBP5,118.124339,4.548293,0.427704,10.634208,2.065762e-26,9.119600e-24,,,,,,,,,,
ENSG00000144655,protein_coding,CSRNP1,264.495098,4.555348,0.437198,10.419411,2.022010e-25,7.573961e-23,,,,,,,,,,
ENSG00000112149,protein_coding,CD83,1107.047595,4.552281,0.437363,10.408477,2.268217e-25,8.369381e-23,,,,,,,,,,
ENSG00000142871,protein_coding,CYR61,32.268183,5.906924,0.574520,10.281489,8.539807e-25,2.852988e-22,,,,,,,,,,


In [19]:
de[de['Gene-name'] == 'CHORDC1']

Unnamed: 0_level_0,Gene-type,Gene-name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000110172,protein_coding,CHORDC1,1652.672338,4.377682,0.29154,15.015742,5.791062e-51,2.045238e-47


In [21]:
gene_names_df.ix['ENSG00000110172']

KeyError: 'ENSG00000110172'